Skip to main content

arrow_string/
substring.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Defines kernel to extract a substring of an Array
19//! Supported array types:
20//! [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray], [DictionaryArray]
21
22use arrow_array::builder::BufferBuilder;
23use arrow_array::types::*;
24use arrow_array::*;
25use arrow_buffer::{ArrowNativeType, MutableBuffer, NullBuffer, OffsetBuffer};
26use arrow_schema::{ArrowError, DataType};
27use num_traits::Zero;
28use std::cmp::Ordering;
29use std::sync::Arc;
30
31/// Returns an [`ArrayRef`] with substrings of all the elements in `array`.
32///
33/// # Arguments
34///
35/// * `start` - The start index of all substrings.
36///   If `start >= 0`, then count from the start of the string,
37///   otherwise count from the end of the string.
38///
39/// * `length`(option) - The length of all substrings.
40///   If `length` is [None], then the substring is from `start` to the end of the string.
41///
42/// Attention: Both `start` and `length` are counted by byte, not by char.
43///
44/// # Basic usage
45/// ```
46/// # use arrow_array::StringArray;
47/// # use arrow_string::substring::substring;
48/// let array = StringArray::from(vec![Some("arrow"), None, Some("rust")]);
49/// let result = substring(&array, 1, Some(4)).unwrap();
50/// let result = result.as_any().downcast_ref::<StringArray>().unwrap();
51/// assert_eq!(result, &StringArray::from(vec![Some("rrow"), None, Some("ust")]));
52/// ```
53///
54/// # Error
55/// - The function errors when the passed array is not a [`GenericStringArray`],
56///   [`GenericBinaryArray`], [`FixedSizeBinaryArray`] or [`DictionaryArray`]
57///   with supported array type as its value type.
58/// - The function errors if the offset of a substring in the input array is
59///   at invalid char boundary (only for \[Large\]String array).
60///   It is recommended to use [`substring_by_char`] if the input array may
61///   contain non-ASCII chars.
62///
63/// ## Example of trying to get an invalid utf-8 format substring
64/// ```
65/// # use arrow_array::StringArray;
66/// # use arrow_string::substring::substring;
67/// let array = StringArray::from(vec![Some("E=mc²")]);
68/// let error = substring(&array, 0, Some(5)).unwrap_err().to_string();
69/// assert!(error.contains("invalid utf-8 boundary"));
70/// ```
71pub fn substring(
72    array: &dyn Array,
73    start: i64,
74    length: Option<u64>,
75) -> Result<ArrayRef, ArrowError> {
76    macro_rules! substring_dict {
77        ($kt: ident, $($t: ident: $gt: ident), *) => {
78            match $kt.as_ref() {
79                $(
80                    &DataType::$t => {
81                        let dict = array
82                            .as_any()
83                            .downcast_ref::<DictionaryArray<$gt>>()
84                            .unwrap_or_else(|| {
85                                panic!("Expect 'DictionaryArray<{}>' but got array of data type {:?}",
86                                       stringify!($gt), array.data_type())
87                            });
88                        let values = substring(dict.values(), start, length)?;
89                        Ok(Arc::new(dict.with_values(values)))
90                    },
91                )*
92                    t => panic!("Unsupported dictionary key type: {}", t)
93            }
94        }
95    }
96
97    match array.data_type() {
98        DataType::Dictionary(kt, _) => {
99            substring_dict!(
100                kt,
101                Int8: Int8Type,
102                Int16: Int16Type,
103                Int32: Int32Type,
104                Int64: Int64Type,
105                UInt8: UInt8Type,
106                UInt16: UInt16Type,
107                UInt32: UInt32Type,
108                UInt64: UInt64Type
109            )
110        }
111        DataType::LargeBinary => byte_substring(
112            array
113                .as_any()
114                .downcast_ref::<LargeBinaryArray>()
115                .expect("A large binary is expected"),
116            start,
117            length.map(|e| e as i64),
118        ),
119        DataType::Binary => byte_substring(
120            array
121                .as_any()
122                .downcast_ref::<BinaryArray>()
123                .expect("A binary is expected"),
124            start as i32,
125            length.map(|e| e as i32),
126        ),
127        DataType::FixedSizeBinary(old_len) => fixed_size_binary_substring(
128            array
129                .as_any()
130                .downcast_ref::<FixedSizeBinaryArray>()
131                .expect("a fixed size binary is expected"),
132            *old_len,
133            start as i32,
134            length.map(|e| e as i32),
135        ),
136        DataType::LargeUtf8 => byte_substring(
137            array
138                .as_any()
139                .downcast_ref::<LargeStringArray>()
140                .expect("A large string is expected"),
141            start,
142            length.map(|e| e as i64),
143        ),
144        DataType::Utf8 => byte_substring(
145            array
146                .as_any()
147                .downcast_ref::<StringArray>()
148                .expect("A string is expected"),
149            start as i32,
150            length.map(|e| e as i32),
151        ),
152        _ => Err(ArrowError::ComputeError(format!(
153            "substring does not support type {:?}",
154            array.data_type()
155        ))),
156    }
157}
158
159/// Substrings based on character index
160///
161/// # Arguments
162/// * `array` - The input string array
163///
164/// * `start` - The start index of all substrings.
165///   If `start >= 0`, then count from the start of the string,
166///   otherwise count from the end of the string.
167///
168/// * `length`(option) - The length of all substrings.
169///   If `length` is `None`, then the substring is from `start` to the end of the string.
170///
171/// Attention: Both `start` and `length` are counted by char.
172///
173/// # Performance
174///
175/// This function is slower than [substring]. Theoretically, the time complexity
176/// is `O(n)` where `n` is the length of the value buffer. It is recommended to
177/// use [substring] if the input array only contains ASCII chars.
178///
179/// # Basic usage
180/// ```
181/// # use arrow_array::StringArray;
182/// # use arrow_string::substring::substring_by_char;
183/// let array = StringArray::from(vec![Some("arrow"), None, Some("Γ ⊢x:T")]);
184/// let result = substring_by_char(&array, 1, Some(4)).unwrap();
185/// assert_eq!(result, StringArray::from(vec![Some("rrow"), None, Some(" ⊢x:")]));
186/// ```
187pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
188    array: &GenericStringArray<OffsetSize>,
189    start: i64,
190    length: Option<u64>,
191) -> Result<GenericStringArray<OffsetSize>, ArrowError> {
192    let mut vals = BufferBuilder::<u8>::new({
193        let offsets = array.value_offsets();
194        (offsets[array.len()] - offsets[0]).to_usize().unwrap()
195    });
196    let mut new_offsets = BufferBuilder::<OffsetSize>::new(array.len() + 1);
197    new_offsets.append(OffsetSize::zero());
198    let length = length.map(|len| len.to_usize().unwrap());
199
200    array.iter().for_each(|val| {
201        if let Some(val) = val {
202            let char_count = val.chars().count();
203            let start = if start >= 0 {
204                start.to_usize().unwrap()
205            } else {
206                char_count - (-start).to_usize().unwrap().min(char_count)
207            };
208            let (start_offset, end_offset) = get_start_end_offset(val, start, length);
209            vals.append_slice(&val.as_bytes()[start_offset..end_offset]);
210        }
211        new_offsets.append(OffsetSize::from_usize(vals.len()).unwrap());
212    });
213    let offsets = OffsetBuffer::new(new_offsets.finish().into());
214    let values = vals.finish();
215    let nulls = array
216        .nulls()
217        .map(|n| n.inner().sliced())
218        .and_then(|b| NullBuffer::from_unsliced_buffer(b, array.len()));
219    Ok(GenericStringArray::<OffsetSize>::new(
220        offsets, values, nulls,
221    ))
222}
223
224/// * `val` - string
225/// * `start` - the start char index of the substring
226/// * `length` - the char length of the substring
227///
228/// Return the `start` and `end` offset (by byte) of the substring
229fn get_start_end_offset(val: &str, start: usize, length: Option<usize>) -> (usize, usize) {
230    let len = val.len();
231    let mut offset_char_iter = val.char_indices();
232    let start_offset = offset_char_iter
233        .nth(start)
234        .map_or(len, |(offset, _)| offset);
235    let end_offset = length.map_or(len, |length| {
236        if length > 0 {
237            offset_char_iter
238                .nth(length - 1)
239                .map_or(len, |(offset, _)| offset)
240        } else {
241            start_offset
242        }
243    });
244    (start_offset, end_offset)
245}
246
247fn byte_substring<T: ByteArrayType>(
248    array: &GenericByteArray<T>,
249    start: T::Offset,
250    length: Option<T::Offset>,
251) -> Result<ArrayRef, ArrowError>
252where
253    <T as ByteArrayType>::Native: PartialEq,
254{
255    let offsets = array.value_offsets();
256    let data = array.value_data();
257    let zero = <T::Offset as Zero>::zero();
258
259    // When array is [Large]StringArray, we will check whether `offset` is at a valid char boundary.
260    let check_char_boundary = {
261        |offset: T::Offset| {
262            if !matches!(T::DATA_TYPE, DataType::Utf8 | DataType::LargeUtf8) {
263                return Ok(offset);
264            }
265            // Safety: a StringArray must contain valid UTF8 data
266            let data_str = unsafe { std::str::from_utf8_unchecked(data) };
267            let offset_usize = offset.as_usize();
268            if data_str.is_char_boundary(offset_usize) {
269                Ok(offset)
270            } else {
271                Err(ArrowError::ComputeError(format!(
272                    "The offset {offset_usize} is at an invalid utf-8 boundary."
273                )))
274            }
275        }
276    };
277
278    // start and end offsets of all substrings
279    let mut new_starts_ends: Vec<(T::Offset, T::Offset)> = Vec::with_capacity(array.len());
280    let mut new_offsets: Vec<T::Offset> = Vec::with_capacity(array.len() + 1);
281    let mut len_so_far = zero;
282    new_offsets.push(zero);
283
284    offsets
285        .windows(2)
286        .try_for_each(|pair| -> Result<(), ArrowError> {
287            let new_start = match start.cmp(&zero) {
288                Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?,
289                Ordering::Equal => pair[0],
290                Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?,
291            };
292            let new_end = match length {
293                Some(length) => check_char_boundary((length + new_start).min(pair[1]))?,
294                None => pair[1],
295            };
296            len_so_far += new_end - new_start;
297            new_starts_ends.push((new_start, new_end));
298            new_offsets.push(len_so_far);
299            Ok(())
300        })?;
301
302    // concatenate substrings into a buffer
303    let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize());
304
305    new_starts_ends
306        .iter()
307        .map(|(start, end)| {
308            let start = start.as_usize();
309            let end = end.as_usize();
310            &data[start..end]
311        })
312        .for_each(|slice| new_values.extend_from_slice(slice));
313
314    let offsets = OffsetBuffer::new(new_offsets.into());
315    let values = new_values.into();
316    let nulls = array
317        .nulls()
318        .map(|n| n.inner().sliced())
319        .and_then(|b| NullBuffer::from_unsliced_buffer(b, array.len()));
320    Ok(Arc::new(GenericByteArray::<T>::new(offsets, values, nulls)))
321}
322
323fn fixed_size_binary_substring(
324    array: &FixedSizeBinaryArray,
325    old_len: i32,
326    start: i32,
327    length: Option<i32>,
328) -> Result<ArrayRef, ArrowError> {
329    let new_start = if start >= 0 {
330        start.min(old_len)
331    } else {
332        (old_len + start).max(0)
333    };
334    let new_len = match length {
335        Some(len) => len.min(old_len - new_start),
336        None => old_len - new_start,
337    };
338
339    // build value buffer
340    let num_of_elements = array.len();
341    let data = array.value_data();
342    let mut new_values = MutableBuffer::new(num_of_elements * (new_len as usize));
343    (0..num_of_elements)
344        .map(|idx| {
345            let offset = array.value_offset(idx);
346            (
347                (offset + new_start) as usize,
348                (offset + new_start + new_len) as usize,
349            )
350        })
351        .for_each(|(start, end)| new_values.extend_from_slice(&data[start..end]));
352
353    let mut nulls = array
354        .nulls()
355        .map(|n| n.inner().sliced())
356        .and_then(|b| NullBuffer::from_unsliced_buffer(b, num_of_elements));
357
358    if new_len == 0 && nulls.is_none() {
359        // FixedSizeBinaryArray::new takes length from the values buffer, except when size == 0.
360        // In that case it uses the null buffer length, so preserve the original length here.
361        // Example: ["", "", ""] -> substring(..., 1, Some(2)) should keep len=3;
362        // otherwise it collapses to an empty array (len=0).
363        nulls = Some(NullBuffer::new_valid(num_of_elements));
364    }
365
366    Ok(Arc::new(FixedSizeBinaryArray::new(
367        new_len,
368        new_values.into(),
369        nulls,
370    )))
371}
372
373#[cfg(test)]
374mod tests {
375    use super::*;
376    use arrow_buffer::BooleanBuffer;
377    use arrow_buffer::Buffer;
378
379    /// A helper macro to generate test cases.
380    /// # Arguments
381    /// * `input` - A vector which array can be built from.
382    /// * `start` - The start index of the substring.
383    /// * `len` - The length of the substring.
384    /// * `result` - The expected result of substring, which is a vector that array can be built from.
385    /// # Return
386    /// A vector of `(input, start, len, result)`.
387    ///
388    /// Users can provide any number of `(start, len, result)` to generate test cases for one `input`.
389    macro_rules! gen_test_cases {
390        ($input:expr, $(($start:expr, $len:expr, $result:expr)), *) => {
391            [
392                $(
393                    ($input.clone(), $start, $len, $result),
394                )*
395            ]
396        };
397    }
398
399    /// A helper macro to test the substring functions.
400    /// # Arguments
401    /// * `cases` - The test cases which is a vector of `(input, start, len, result)`.
402    ///   Please look at [`gen_test_cases`] to find how to generate it.
403    /// * `array_ty` - The array type.
404    /// * `substring_fn` - Either [`substring`] or [`substring_by_char`].
405    macro_rules! do_test {
406        ($cases:expr, $array_ty:ty, $substring_fn:ident) => {
407            $cases
408                .into_iter()
409                .for_each(|(array, start, length, expected)| {
410                    let array = <$array_ty>::from(array);
411                    let result = $substring_fn(&array, start, length).unwrap();
412                    let result = result.as_any().downcast_ref::<$array_ty>().unwrap();
413                    let expected = <$array_ty>::from(expected);
414                    assert_eq!(&expected, result);
415                })
416        };
417    }
418
419    fn with_nulls_generic_binary<O: OffsetSizeTrait>() {
420        let input = vec![
421            Some("hello".as_bytes()),
422            None,
423            Some(&[0xf8, 0xf9, 0xff, 0xfa]),
424        ];
425        // all-nulls array is always identical
426        let base_case = gen_test_cases!(
427            vec![None, None, None],
428            (-1, Some(1), vec![None, None, None])
429        );
430        let cases = gen_test_cases!(
431            input,
432            // identity
433            (0, None, input.clone()),
434            // 0 length -> Nothing
435            (0, Some(0), vec![Some(&[]), None, Some(&[])]),
436            // high start -> Nothing
437            (1000, Some(0), vec![Some(&[]), None, Some(&[])]),
438            // high negative start -> identity
439            (-1000, None, input.clone()),
440            // high length -> identity
441            (0, Some(1000), input.clone())
442        );
443
444        do_test!(
445            [&base_case[..], &cases[..]].concat(),
446            GenericBinaryArray<O>,
447            substring
448        );
449    }
450
451    #[test]
452    fn with_nulls_binary() {
453        with_nulls_generic_binary::<i32>()
454    }
455
456    #[test]
457    fn with_nulls_large_binary() {
458        with_nulls_generic_binary::<i64>()
459    }
460
461    fn without_nulls_generic_binary<O: OffsetSizeTrait>() {
462        let input = vec!["hello".as_bytes(), b"", &[0xf8, 0xf9, 0xff, 0xfa]];
463        // empty array is always identical
464        let base_case = gen_test_cases!(
465            vec!["".as_bytes(), b"", b""],
466            (2, Some(1), vec!["".as_bytes(), b"", b""])
467        );
468        let cases = gen_test_cases!(
469            input,
470            // identity
471            (0, None, input.clone()),
472            // increase start
473            (1, None, vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]),
474            (2, None, vec![b"llo", b"", &[0xff, 0xfa]]),
475            (3, None, vec![b"lo", b"", &[0xfa]]),
476            (10, None, vec![b"", b"", b""]),
477            // increase start negatively
478            (-1, None, vec![b"o", b"", &[0xfa]]),
479            (-2, None, vec![b"lo", b"", &[0xff, 0xfa]]),
480            (-3, None, vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]),
481            (-10, None, input.clone()),
482            // increase length
483            (1, Some(1), vec![b"e", b"", &[0xf9]]),
484            (1, Some(2), vec![b"el", b"", &[0xf9, 0xff]]),
485            (1, Some(3), vec![b"ell", b"", &[0xf9, 0xff, 0xfa]]),
486            (1, Some(4), vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]),
487            (-3, Some(1), vec![b"l", b"", &[0xf9]]),
488            (-3, Some(2), vec![b"ll", b"", &[0xf9, 0xff]]),
489            (-3, Some(3), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]),
490            (-3, Some(4), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]])
491        );
492
493        do_test!(
494            [&base_case[..], &cases[..]].concat(),
495            GenericBinaryArray<O>,
496            substring
497        );
498    }
499
500    #[test]
501    fn without_nulls_binary() {
502        without_nulls_generic_binary::<i32>()
503    }
504
505    #[test]
506    fn without_nulls_large_binary() {
507        without_nulls_generic_binary::<i64>()
508    }
509
510    fn generic_binary_with_non_zero_offset<O: OffsetSizeTrait>() {
511        let values = 0_u8..15;
512        let offsets = &[
513            O::zero(),
514            O::from_usize(5).unwrap(),
515            O::from_usize(10).unwrap(),
516            O::from_usize(15).unwrap(),
517        ];
518        // set the first and third element to be valid
519        let bitmap = [0b101_u8];
520
521        let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
522        let values = Buffer::from_iter(values);
523        let nulls = Some(NullBuffer::new(BooleanBuffer::new(
524            Buffer::from(bitmap),
525            0,
526            3,
527        )));
528        // array is `[null, [10, 11, 12, 13, 14]]`
529        let array = GenericBinaryArray::<O>::new(offsets, values, nulls).slice(1, 2);
530        // result is `[null, [11, 12, 13, 14]]`
531        let result = substring(&array, 1, None).unwrap();
532        let result = result
533            .as_any()
534            .downcast_ref::<GenericBinaryArray<O>>()
535            .unwrap();
536        let expected =
537            GenericBinaryArray::<O>::from_opt_vec(vec![None, Some(&[11_u8, 12, 13, 14])]);
538        assert_eq!(result, &expected);
539    }
540
541    #[test]
542    fn binary_with_non_zero_offset() {
543        generic_binary_with_non_zero_offset::<i32>()
544    }
545
546    #[test]
547    fn large_binary_with_non_zero_offset() {
548        generic_binary_with_non_zero_offset::<i64>()
549    }
550
551    #[test]
552    fn with_nulls_fixed_size_binary() {
553        let input = vec![Some("cat".as_bytes()), None, Some(&[0xf8, 0xf9, 0xff])];
554        // all-nulls array is always identical
555        let base_case =
556            gen_test_cases!(vec![None, None, None], (3, Some(2), vec![None, None, None]));
557        let cases = gen_test_cases!(
558            input,
559            // identity
560            (0, None, input.clone()),
561            // increase start
562            (1, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
563            (2, None, vec![Some(b"t"), None, Some(&[0xff])]),
564            (3, None, vec![Some(b""), None, Some(b"")]),
565            (10, None, vec![Some(b""), None, Some(b"")]),
566            // increase start negatively
567            (-1, None, vec![Some(b"t"), None, Some(&[0xff])]),
568            (-2, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
569            (-3, None, input.clone()),
570            (-10, None, input.clone()),
571            // increase length
572            (1, Some(1), vec![Some(b"a"), None, Some(&[0xf9])]),
573            (1, Some(2), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
574            (1, Some(3), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
575            (-3, Some(1), vec![Some(b"c"), None, Some(&[0xf8])]),
576            (-3, Some(2), vec![Some(b"ca"), None, Some(&[0xf8, 0xf9])]),
577            (-3, Some(3), input.clone()),
578            (-3, Some(4), input.clone())
579        );
580
581        do_test!(
582            [&base_case[..], &cases[..]].concat(),
583            FixedSizeBinaryArray,
584            substring
585        );
586    }
587
588    #[test]
589    fn without_nulls_fixed_size_binary() {
590        let input = vec!["cat".as_bytes(), b"dog", &[0xf8, 0xf9, 0xff]];
591        // empty array is always identical
592        let base_case = gen_test_cases!(
593            vec!["".as_bytes(), &[], &[]],
594            (1, Some(2), vec!["".as_bytes(), &[], &[]])
595        );
596        let cases = gen_test_cases!(
597            input,
598            // identity
599            (0, None, input.clone()),
600            // increase start
601            (1, None, vec![b"at", b"og", &[0xf9, 0xff]]),
602            (2, None, vec![b"t", b"g", &[0xff]]),
603            (3, None, vec![&[], &[], &[]]),
604            (10, None, vec![&[], &[], &[]]),
605            // increase start negatively
606            (-1, None, vec![b"t", b"g", &[0xff]]),
607            (-2, None, vec![b"at", b"og", &[0xf9, 0xff]]),
608            (-3, None, input.clone()),
609            (-10, None, input.clone()),
610            // increase length
611            (1, Some(1), vec![b"a", b"o", &[0xf9]]),
612            (1, Some(2), vec![b"at", b"og", &[0xf9, 0xff]]),
613            (1, Some(3), vec![b"at", b"og", &[0xf9, 0xff]]),
614            (-3, Some(1), vec![b"c", b"d", &[0xf8]]),
615            (-3, Some(2), vec![b"ca", b"do", &[0xf8, 0xf9]]),
616            (-3, Some(3), input.clone()),
617            (-3, Some(4), input.clone())
618        );
619
620        do_test!(
621            [&base_case[..], &cases[..]].concat(),
622            FixedSizeBinaryArray,
623            substring
624        );
625    }
626
627    #[test]
628    fn fixed_size_binary_with_non_zero_offset() {
629        let values: [u8; 15] = *b"hellotherearrow";
630        // set the first and third element to be valid
631        let bits_v = [0b101_u8];
632
633        let nulls = Some(NullBuffer::new(BooleanBuffer::new(
634            Buffer::from(bits_v),
635            0,
636            3,
637        )));
638        // array is `[null, "arrow"]`
639        let array = FixedSizeBinaryArray::new(5, Buffer::from(&values), nulls).slice(1, 2);
640        // result is `[null, "rrow"]`
641        let result = substring(&array, 1, None).unwrap();
642        let result = result
643            .as_any()
644            .downcast_ref::<FixedSizeBinaryArray>()
645            .unwrap();
646        let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
647            vec![None, Some(b"rrow")].into_iter(),
648            4,
649        )
650        .unwrap();
651        assert_eq!(result, &expected);
652    }
653
654    fn with_nulls_generic_string<O: OffsetSizeTrait>() {
655        let input = vec![Some("hello"), None, Some("word")];
656        // all-nulls array is always identical
657        let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None]));
658        let cases = gen_test_cases!(
659            input,
660            // identity
661            (0, None, input.clone()),
662            // 0 length -> Nothing
663            (0, Some(0), vec![Some(""), None, Some("")]),
664            // high start -> Nothing
665            (1000, Some(0), vec![Some(""), None, Some("")]),
666            // high negative start -> identity
667            (-1000, None, input.clone()),
668            // high length -> identity
669            (0, Some(1000), input.clone())
670        );
671
672        do_test!(
673            [&base_case[..], &cases[..]].concat(),
674            GenericStringArray<O>,
675            substring
676        );
677    }
678
679    #[test]
680    fn with_nulls_string() {
681        with_nulls_generic_string::<i32>()
682    }
683
684    #[test]
685    fn with_nulls_large_string() {
686        with_nulls_generic_string::<i64>()
687    }
688
689    fn without_nulls_generic_string<O: OffsetSizeTrait>() {
690        let input = vec!["hello", "", "word"];
691        // empty array is always identical
692        let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""]));
693        let cases = gen_test_cases!(
694            input,
695            // identity
696            (0, None, input.clone()),
697            (1, None, vec!["ello", "", "ord"]),
698            (2, None, vec!["llo", "", "rd"]),
699            (3, None, vec!["lo", "", "d"]),
700            (10, None, vec!["", "", ""]),
701            // increase start negatively
702            (-1, None, vec!["o", "", "d"]),
703            (-2, None, vec!["lo", "", "rd"]),
704            (-3, None, vec!["llo", "", "ord"]),
705            (-10, None, input.clone()),
706            // increase length
707            (1, Some(1), vec!["e", "", "o"]),
708            (1, Some(2), vec!["el", "", "or"]),
709            (1, Some(3), vec!["ell", "", "ord"]),
710            (1, Some(4), vec!["ello", "", "ord"]),
711            (-3, Some(1), vec!["l", "", "o"]),
712            (-3, Some(2), vec!["ll", "", "or"]),
713            (-3, Some(3), vec!["llo", "", "ord"]),
714            (-3, Some(4), vec!["llo", "", "ord"])
715        );
716
717        do_test!(
718            [&base_case[..], &cases[..]].concat(),
719            GenericStringArray<O>,
720            substring
721        );
722    }
723
724    #[test]
725    fn without_nulls_string() {
726        without_nulls_generic_string::<i32>()
727    }
728
729    #[test]
730    fn without_nulls_large_string() {
731        without_nulls_generic_string::<i64>()
732    }
733
734    fn generic_string_with_non_zero_offset<O: OffsetSizeTrait>() {
735        let values = b"hellotherearrow";
736        let offsets = &[
737            O::zero(),
738            O::from_usize(5).unwrap(),
739            O::from_usize(10).unwrap(),
740            O::from_usize(15).unwrap(),
741        ];
742        // set the first and third element to be valid
743        let bitmap = [0b101_u8];
744
745        let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
746        let values = Buffer::from(values);
747        let nulls = Some(NullBuffer::new(BooleanBuffer::new(
748            Buffer::from(bitmap),
749            0,
750            3,
751        )));
752        // array is `[null, "arrow"]`
753        let array = GenericStringArray::<O>::new(offsets, values, nulls).slice(1, 2);
754        // result is `[null, "rrow"]`
755        let result = substring(&array, 1, None).unwrap();
756        let result = result
757            .as_any()
758            .downcast_ref::<GenericStringArray<O>>()
759            .unwrap();
760        let expected = GenericStringArray::<O>::from(vec![None, Some("rrow")]);
761        assert_eq!(result, &expected);
762    }
763
764    #[test]
765    fn string_with_non_zero_offset() {
766        generic_string_with_non_zero_offset::<i32>()
767    }
768
769    #[test]
770    fn large_string_with_non_zero_offset() {
771        generic_string_with_non_zero_offset::<i64>()
772    }
773
774    fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() {
775        let input = vec![Some("hello"), None, Some("Γ ⊢x:T")];
776        // all-nulls array is always identical
777        let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None]));
778        let cases = gen_test_cases!(
779            input,
780            // identity
781            (0, None, input.clone()),
782            // 0 length -> Nothing
783            (0, Some(0), vec![Some(""), None, Some("")]),
784            // high start -> Nothing
785            (1000, Some(0), vec![Some(""), None, Some("")]),
786            // high negative start -> identity
787            (-1000, None, input.clone()),
788            // high length -> identity
789            (0, Some(1000), input.clone())
790        );
791
792        do_test!(
793            [&base_case[..], &cases[..]].concat(),
794            GenericStringArray<O>,
795            substring_by_char
796        );
797    }
798
799    #[test]
800    fn with_nulls_string_by_char() {
801        with_nulls_generic_string_by_char::<i32>()
802    }
803
804    #[test]
805    fn with_nulls_large_string_by_char() {
806        with_nulls_generic_string_by_char::<i64>()
807    }
808
809    fn without_nulls_generic_string_by_char<O: OffsetSizeTrait>() {
810        let input = vec!["hello", "", "Γ ⊢x:T"];
811        // empty array is always identical
812        let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""]));
813        let cases = gen_test_cases!(
814            input,
815            //identity
816            (0, None, input.clone()),
817            // increase start
818            (1, None, vec!["ello", "", " ⊢x:T"]),
819            (2, None, vec!["llo", "", "⊢x:T"]),
820            (3, None, vec!["lo", "", "x:T"]),
821            (10, None, vec!["", "", ""]),
822            // increase start negatively
823            (-1, None, vec!["o", "", "T"]),
824            (-2, None, vec!["lo", "", ":T"]),
825            (-4, None, vec!["ello", "", "⊢x:T"]),
826            (-10, None, input.clone()),
827            // increase length
828            (1, Some(1), vec!["e", "", " "]),
829            (1, Some(2), vec!["el", "", " ⊢"]),
830            (1, Some(3), vec!["ell", "", " ⊢x"]),
831            (1, Some(6), vec!["ello", "", " ⊢x:T"]),
832            (-4, Some(1), vec!["e", "", "⊢"]),
833            (-4, Some(2), vec!["el", "", "⊢x"]),
834            (-4, Some(3), vec!["ell", "", "⊢x:"]),
835            (-4, Some(4), vec!["ello", "", "⊢x:T"])
836        );
837
838        do_test!(
839            [&base_case[..], &cases[..]].concat(),
840            GenericStringArray<O>,
841            substring_by_char
842        );
843    }
844
845    #[test]
846    fn without_nulls_string_by_char() {
847        without_nulls_generic_string_by_char::<i32>()
848    }
849
850    #[test]
851    fn without_nulls_large_string_by_char() {
852        without_nulls_generic_string_by_char::<i64>()
853    }
854
855    fn generic_string_by_char_with_non_zero_offset<O: OffsetSizeTrait>() {
856        let values = "S→T = Πx:S.T";
857        let offsets = &[
858            O::zero(),
859            O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap()).unwrap(),
860            O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap()).unwrap(),
861            O::from_usize(values.len()).unwrap(),
862        ];
863        // set the first and third element to be valid
864        let bitmap = [0b101_u8];
865
866        let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
867        let values = Buffer::from(values.as_bytes());
868        let nulls = Some(NullBuffer::new(BooleanBuffer::new(
869            Buffer::from(bitmap),
870            0,
871            3,
872        )));
873        // array is `[null, "Πx:S.T"]`
874        let array = GenericStringArray::<O>::new(offsets, values, nulls).slice(1, 2);
875        // result is `[null, "x:S.T"]`
876        let result = substring_by_char(&array, 1, None).unwrap();
877        let expected = GenericStringArray::<O>::from(vec![None, Some("x:S.T")]);
878        assert_eq!(result, expected);
879    }
880
881    #[test]
882    fn string_with_non_zero_offset_by_char() {
883        generic_string_by_char_with_non_zero_offset::<i32>()
884    }
885
886    #[test]
887    fn large_string_with_non_zero_offset_by_char() {
888        generic_string_by_char_with_non_zero_offset::<i64>()
889    }
890
891    #[test]
892    fn dictionary() {
893        _dictionary::<Int8Type>();
894        _dictionary::<Int16Type>();
895        _dictionary::<Int32Type>();
896        _dictionary::<Int64Type>();
897        _dictionary::<UInt8Type>();
898        _dictionary::<UInt16Type>();
899        _dictionary::<UInt32Type>();
900        _dictionary::<UInt64Type>();
901    }
902
903    fn _dictionary<K: ArrowDictionaryKeyType>() {
904        const TOTAL: i32 = 100;
905
906        let v = ["aaa", "bbb", "ccc", "ddd", "eee"];
907        let data: Vec<Option<&str>> = (0..TOTAL)
908            .map(|n| {
909                let i = n % 5;
910                if i == 3 { None } else { Some(v[i as usize]) }
911            })
912            .collect();
913
914        let dict_array: DictionaryArray<K> = data.clone().into_iter().collect();
915
916        let expected: Vec<Option<&str>> = data.iter().map(|opt| opt.map(|s| &s[1..3])).collect();
917
918        let res = substring(&dict_array, 1, Some(2)).unwrap();
919        let actual = res.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
920        let actual: Vec<Option<&str>> = actual
921            .values()
922            .as_any()
923            .downcast_ref::<GenericStringArray<i32>>()
924            .unwrap()
925            .take_iter(actual.keys_iter())
926            .collect();
927
928        for i in 0..TOTAL as usize {
929            assert_eq!(expected[i], actual[i],);
930        }
931    }
932
933    #[test]
934    fn check_invalid_array_type() {
935        let array = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
936        let err = substring(&array, 0, None).unwrap_err().to_string();
937        assert!(err.contains("substring does not support type"));
938    }
939
940    // tests for the utf-8 validation checking
941    #[test]
942    fn check_start_index() {
943        let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
944        let err = substring(&array, -1, None).unwrap_err().to_string();
945        assert!(err.contains("invalid utf-8 boundary"));
946    }
947
948    #[test]
949    fn check_length() {
950        let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
951        let err = substring(&array, 0, Some(5)).unwrap_err().to_string();
952        assert!(err.contains("invalid utf-8 boundary"));
953    }
954
955    #[test]
956    fn non_utf8_bytes() {
957        // non-utf8 bytes
958        let bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD, 0xE8, 0xAF, 0xAD];
959        let array = BinaryArray::from(vec![Some(bytes)]);
960        let arr = substring(&array, 0, Some(5)).unwrap();
961        let actual = arr.as_any().downcast_ref::<BinaryArray>().unwrap();
962
963        let expected_bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5];
964        let expected = BinaryArray::from(vec![Some(expected_bytes)]);
965        assert_eq!(expected, *actual);
966    }
967}