Skip to content
Merged
30 changes: 30 additions & 0 deletions datafusion/functions/benches/replace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,36 @@ fn criterion_benchmark(c: &mut Criterion) {
}
}

// Benchmarks for the empty-`from` path, where `to` is inserted at both ends
// of the input and between every pair of characters. Only the 1024-row case
// is measured, once per string length and once per benchmark variant.
if size == 1024 {
    // (benchmark name prefix, third `create_args` argument). Judging by the
    // benchmark names the flag presumably selects string-view inputs --
    // confirm against `create_args`.
    let variants = [
        ("replace_string_empty_from", false),
        ("replace_string_view_empty_from", true),
    ];

    for str_len in [32_usize, 128] {
        for (label, use_view) in variants {
            let args = create_args::<i32>(size, str_len, use_view, 0, 3, 0.0);
            let bench_name = format!("{label} [size={size}, str_len={str_len}]");
            group.bench_function(bench_name, |b| {
                b.iter(|| {
                    // The arguments are cloned on every iteration before
                    // being handed to the function under test.
                    let args_cloned = args.clone();
                    black_box(invoke_replace_with_args(args_cloned, size))
                })
            });
        }
    }
}

group.finish();
}
}
Expand Down
84 changes: 42 additions & 42 deletions datafusion/functions/src/string/replace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
use arrow::buffer::NullBuffer;
use arrow::datatypes::DataType;

use crate::strings::GenericStringArrayBuilder;
use crate::strings::{
BulkNullStringArrayBuilder, GenericStringArrayBuilder, StringWriter,
};
use crate::utils::{make_scalar_function, utf8_to_str_type};
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
use datafusion_common::types::logical_string;
Expand Down Expand Up @@ -164,7 +166,6 @@ fn replace_view(args: &[ArrayRef]) -> Result<ArrayRef> {

let len = string_array.len();
let mut builder = GenericStringArrayBuilder::<i32>::with_capacity(len, 0);
let mut buffer = String::new();
let nulls = NullBuffer::union_many([
string_array.nulls(),
from_array.nulls(),
Expand All @@ -184,19 +185,15 @@ fn replace_view(args: &[ArrayRef]) -> Result<ArrayRef> {
let string = unsafe { string_array.value_unchecked(i) };
let from = unsafe { from_array.value_unchecked(i) };
let to = unsafe { to_array.value_unchecked(i) };
buffer.clear();
replace_into_string(&mut buffer, string, from, to);
builder.append_value(&buffer);
apply_replace(&mut builder, string, from, to);
}
} else {
for i in 0..len {
// SAFETY: i < len, and no input has a null buffer.
let string = unsafe { string_array.value_unchecked(i) };
let from = unsafe { from_array.value_unchecked(i) };
let to = unsafe { to_array.value_unchecked(i) };
buffer.clear();
replace_into_string(&mut buffer, string, from, to);
builder.append_value(&buffer);
apply_replace(&mut builder, string, from, to);
}
}

Expand All @@ -212,7 +209,6 @@ fn replace<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {

let len = string_array.len();
let mut builder = GenericStringArrayBuilder::<T>::with_capacity(len, 0);
let mut buffer = String::new();
let nulls = NullBuffer::union_many([
string_array.nulls(),
from_array.nulls(),
Expand All @@ -232,71 +228,75 @@ fn replace<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let string = unsafe { string_array.value_unchecked(i) };
let from = unsafe { from_array.value_unchecked(i) };
let to = unsafe { to_array.value_unchecked(i) };
buffer.clear();
replace_into_string(&mut buffer, string, from, to);
builder.append_value(&buffer);
apply_replace(&mut builder, string, from, to);
}
} else {
for i in 0..len {
// SAFETY: i < len, and no input has a null buffer.
let string = unsafe { string_array.value_unchecked(i) };
let from = unsafe { from_array.value_unchecked(i) };
let to = unsafe { to_array.value_unchecked(i) };
buffer.clear();
replace_into_string(&mut buffer, string, from, to);
builder.append_value(&buffer);
apply_replace(&mut builder, string, from, to);
}
}

Ok(Arc::new(builder.finish(nulls)?) as ArrayRef)
}

/// Helper function to perform string replacement into a reusable String buffer
#[inline]
fn replace_into_string(buffer: &mut String, string: &str, from: &str, to: &str) {
if from.is_empty() {
// When from is empty, insert 'to' at the beginning, between each character, and at the end
// This matches the behavior of str::replace()
buffer.push_str(to);
for ch in string.chars() {
buffer.push(ch);
buffer.push_str(to);
}
return;
}

// Fast path for replacing a single ASCII character with another single ASCII character.
// Extends the buffer's underlying Vec<u8> directly, for performance.
if let ([from_byte], [to_byte]) = (from.as_bytes(), to.as_bytes())
fn apply_replace<B: BulkNullStringArrayBuilder>(
builder: &mut B,
string: &str,
from: &str,
to: &str,
) {
// Hot path: single ASCII byte → single ASCII byte. An ASCII byte (< 0x80)
// cannot appear inside a multi-byte UTF-8 sequence, so any multi-byte
// sequences in `string` pass through unchanged and output stays valid
// UTF-8.
if let (&[from_byte], &[to_byte]) = (from.as_bytes(), to.as_bytes())
&& from_byte.is_ascii()
&& to_byte.is_ascii()
{
// SAFETY: Replacing an ASCII byte with another ASCII byte preserves UTF-8 validity.
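// SAFETY: `from_byte` and `to_byte` were both checked to be ASCII, and an
// ASCII byte never occurs inside a multi-byte UTF-8 sequence, so the
// byte-wise map leaves the output valid UTF-8.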
unsafe {
buffer.as_mut_vec().extend(
string
.as_bytes()
.iter()
.map(|&b| if b == *from_byte { *to_byte } else { b }),
);
builder.append_byte_map(string.as_bytes(), |b| {
if b == from_byte { to_byte } else { b }
});
}
return;
}

if from.is_empty() {
// Empty `from`: insert `to` at the start, between every pair of
// characters, and at the end (the same result as `str::replace` with an
// empty pattern).
builder.append_with(|w| {
w.write_str(to);
for ch in string.chars() {
w.write_char(ch);
w.write_str(to);
}
});
return;
}

builder.append_with(|w| replace_into_writer(w, string, from, to));
}

#[inline]
fn replace_into_writer<W: StringWriter>(w: &mut W, string: &str, from: &str, to: &str) {
let mut last_end = 0;
for (start, _part) in string.match_indices(from) {
buffer.push_str(&string[last_end..start]);
buffer.push_str(to);
w.write_str(&string[last_end..start]);
w.write_str(to);
last_end = start + from.len();
}
buffer.push_str(&string[last_end..]);
w.write_str(&string[last_end..]);
}

#[cfg(test)]
mod tests {
use super::*;
use crate::utils::test::test_function;
use arrow::array::Array;
use arrow::array::LargeStringArray;
use arrow::array::StringArray;
use arrow::datatypes::DataType::{LargeUtf8, Utf8};
Expand Down
Loading
Loading