Skip to main content

arrow_string/
substring.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Defines kernel to extract a substring of an Array
19//! Supported array types:
20//! [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray], [DictionaryArray]
21
22use arrow_array::builder::BufferBuilder;
23use arrow_array::types::*;
24use arrow_array::*;
25use arrow_buffer::{ArrowNativeType, MutableBuffer, NullBuffer, OffsetBuffer};
26use arrow_schema::{ArrowError, DataType};
27use num_traits::Zero;
28use std::cmp::Ordering;
29use std::sync::Arc;
30
31/// Returns an [`ArrayRef`] with substrings of all the elements in `array`.
32///
33/// # Arguments
34///
35/// * `start` - The start index of all substrings.
36///   If `start >= 0`, then count from the start of the string,
37///   otherwise count from the end of the string.
38///
39/// * `length`(option) - The length of all substrings.
40///   If `length` is [None], then the substring is from `start` to the end of the string.
41///
42/// Attention: Both `start` and `length` are counted by byte, not by char.
43///
44/// # Basic usage
45/// ```
46/// # use arrow_array::StringArray;
47/// # use arrow_string::substring::substring;
48/// let array = StringArray::from(vec![Some("arrow"), None, Some("rust")]);
49/// let result = substring(&array, 1, Some(4)).unwrap();
50/// let result = result.as_any().downcast_ref::<StringArray>().unwrap();
51/// assert_eq!(result, &StringArray::from(vec![Some("rrow"), None, Some("ust")]));
52/// ```
53///
54/// # Error
55/// - The function errors when the passed array is not a [`GenericStringArray`],
56///   [`GenericBinaryArray`], [`FixedSizeBinaryArray`] or [`DictionaryArray`]
57///   with supported array type as its value type.
58/// - The function errors if the offset of a substring in the input array is
59///   at invalid char boundary (only for \[Large\]String array).
60///   It is recommended to use [`substring_by_char`] if the input array may
61///   contain non-ASCII chars.
62///
63/// ## Example of trying to get an invalid utf-8 format substring
64/// ```
65/// # use arrow_array::StringArray;
66/// # use arrow_string::substring::substring;
67/// let array = StringArray::from(vec![Some("E=mc²")]);
68/// let error = substring(&array, 0, Some(5)).unwrap_err().to_string();
69/// assert!(error.contains("invalid utf-8 boundary"));
70/// ```
71pub fn substring(
72    array: &dyn Array,
73    start: i64,
74    length: Option<u64>,
75) -> Result<ArrayRef, ArrowError> {
76    macro_rules! substring_dict {
77        ($kt: ident, $($t: ident: $gt: ident), *) => {
78            match $kt.as_ref() {
79                $(
80                    &DataType::$t => {
81                        let dict = array
82                            .as_any()
83                            .downcast_ref::<DictionaryArray<$gt>>()
84                            .unwrap_or_else(|| {
85                                panic!("Expect 'DictionaryArray<{}>' but got array of data type {:?}",
86                                       stringify!($gt), array.data_type())
87                            });
88                        let values = substring(dict.values(), start, length)?;
89                        Ok(Arc::new(dict.with_values(values)))
90                    },
91                )*
92                    t => panic!("Unsupported dictionary key type: {}", t)
93            }
94        }
95    }
96
97    match array.data_type() {
98        DataType::Dictionary(kt, _) => {
99            substring_dict!(
100                kt,
101                Int8: Int8Type,
102                Int16: Int16Type,
103                Int32: Int32Type,
104                Int64: Int64Type,
105                UInt8: UInt8Type,
106                UInt16: UInt16Type,
107                UInt32: UInt32Type,
108                UInt64: UInt64Type
109            )
110        }
111        DataType::LargeBinary => byte_substring(
112            array
113                .as_any()
114                .downcast_ref::<LargeBinaryArray>()
115                .expect("A large binary is expected"),
116            start,
117            length.map(|e| e as i64),
118        ),
119        DataType::Binary => byte_substring(
120            array
121                .as_any()
122                .downcast_ref::<BinaryArray>()
123                .expect("A binary is expected"),
124            start as i32,
125            length.map(|e| e as i32),
126        ),
127        DataType::FixedSizeBinary(old_len) => {
128            let old_len: usize = (*old_len)
129                .try_into()
130                .expect("negative FixedSizeBinary value length");
131            fixed_size_binary_substring(
132                array
133                    .as_any()
134                    .downcast_ref::<FixedSizeBinaryArray>()
135                    .expect("a fixed size binary is expected"),
136                old_len,
137                start,
138                length,
139            )
140        }
141        DataType::LargeUtf8 => byte_substring(
142            array
143                .as_any()
144                .downcast_ref::<LargeStringArray>()
145                .expect("A large string is expected"),
146            start,
147            length.map(|e| e as i64),
148        ),
149        DataType::Utf8 => byte_substring(
150            array
151                .as_any()
152                .downcast_ref::<StringArray>()
153                .expect("A string is expected"),
154            start as i32,
155            length.map(|e| e as i32),
156        ),
157        _ => Err(ArrowError::ComputeError(format!(
158            "substring does not support type {:?}",
159            array.data_type()
160        ))),
161    }
162}
163
164/// Substrings based on character index
165///
166/// # Arguments
167/// * `array` - The input string array
168///
169/// * `start` - The start index of all substrings.
170///   If `start >= 0`, then count from the start of the string,
171///   otherwise count from the end of the string.
172///
173/// * `length`(option) - The length of all substrings.
174///   If `length` is `None`, then the substring is from `start` to the end of the string.
175///
176/// Attention: Both `start` and `length` are counted by char.
177///
178/// # Performance
179///
180/// This function is slower than [substring]. Theoretically, the time complexity
181/// is `O(n)` where `n` is the length of the value buffer. It is recommended to
182/// use [substring] if the input array only contains ASCII chars.
183///
184/// # Basic usage
185/// ```
186/// # use arrow_array::StringArray;
187/// # use arrow_string::substring::substring_by_char;
188/// let array = StringArray::from(vec![Some("arrow"), None, Some("Γ ⊢x:T")]);
189/// let result = substring_by_char(&array, 1, Some(4)).unwrap();
190/// assert_eq!(result, StringArray::from(vec![Some("rrow"), None, Some(" ⊢x:")]));
191/// ```
192pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
193    array: &GenericStringArray<OffsetSize>,
194    start: i64,
195    length: Option<u64>,
196) -> Result<GenericStringArray<OffsetSize>, ArrowError> {
197    let mut vals = BufferBuilder::<u8>::new({
198        let offsets = array.value_offsets();
199        (offsets[array.len()] - offsets[0]).to_usize().unwrap()
200    });
201    let mut new_offsets = BufferBuilder::<OffsetSize>::new(array.len() + 1);
202    new_offsets.append(OffsetSize::zero());
203    let length = length.map(|len| len.to_usize().unwrap());
204
205    array.iter().for_each(|val| {
206        if let Some(val) = val {
207            let char_count = val.chars().count();
208            let start = if start >= 0 {
209                start.to_usize().unwrap()
210            } else {
211                char_count - (-start).to_usize().unwrap().min(char_count)
212            };
213            let (start_offset, end_offset) = get_start_end_offset(val, start, length);
214            vals.append_slice(&val.as_bytes()[start_offset..end_offset]);
215        }
216        new_offsets.append(OffsetSize::from_usize(vals.len()).unwrap());
217    });
218    let offsets = OffsetBuffer::new(new_offsets.finish().into());
219    let values = vals.finish();
220    let nulls = array
221        .nulls()
222        .map(|n| n.inner().sliced())
223        .and_then(|b| NullBuffer::from_unsliced_buffer(b, array.len()));
224    Ok(GenericStringArray::<OffsetSize>::new(
225        offsets, values, nulls,
226    ))
227}
228
229/// * `val` - string
230/// * `start` - the start char index of the substring
231/// * `length` - the char length of the substring
232///
233/// Return the `start` and `end` offset (by byte) of the substring
234fn get_start_end_offset(val: &str, start: usize, length: Option<usize>) -> (usize, usize) {
235    let len = val.len();
236    let mut offset_char_iter = val.char_indices();
237    let start_offset = offset_char_iter
238        .nth(start)
239        .map_or(len, |(offset, _)| offset);
240    let end_offset = length.map_or(len, |length| {
241        if length > 0 {
242            offset_char_iter
243                .nth(length - 1)
244                .map_or(len, |(offset, _)| offset)
245        } else {
246            start_offset
247        }
248    });
249    (start_offset, end_offset)
250}
251
252fn byte_substring<T: ByteArrayType>(
253    array: &GenericByteArray<T>,
254    start: T::Offset,
255    length: Option<T::Offset>,
256) -> Result<ArrayRef, ArrowError>
257where
258    <T as ByteArrayType>::Native: PartialEq,
259{
260    let offsets = array.value_offsets();
261    let data = array.value_data();
262    let zero = <T::Offset as Zero>::zero();
263
264    // When array is [Large]StringArray, we will check whether `offset` is at a valid char boundary.
265    let check_char_boundary = {
266        |offset: T::Offset| {
267            if !matches!(T::DATA_TYPE, DataType::Utf8 | DataType::LargeUtf8) {
268                return Ok(offset);
269            }
270            // Safety: a StringArray must contain valid UTF8 data
271            let data_str = unsafe { std::str::from_utf8_unchecked(data) };
272            let offset_usize = offset.as_usize();
273            if data_str.is_char_boundary(offset_usize) {
274                Ok(offset)
275            } else {
276                Err(ArrowError::ComputeError(format!(
277                    "The offset {offset_usize} is at an invalid utf-8 boundary."
278                )))
279            }
280        }
281    };
282
283    // start and end offsets of all substrings
284    let mut new_starts_ends: Vec<(T::Offset, T::Offset)> = Vec::with_capacity(array.len());
285    let mut new_offsets: Vec<T::Offset> = Vec::with_capacity(array.len() + 1);
286    let mut len_so_far = zero;
287    new_offsets.push(zero);
288
289    offsets
290        .windows(2)
291        .try_for_each(|pair| -> Result<(), ArrowError> {
292            let new_start = match start.cmp(&zero) {
293                Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?,
294                Ordering::Equal => pair[0],
295                Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?,
296            };
297            let new_end = match length {
298                Some(length) => check_char_boundary((length + new_start).min(pair[1]))?,
299                None => pair[1],
300            };
301            len_so_far += new_end - new_start;
302            new_starts_ends.push((new_start, new_end));
303            new_offsets.push(len_so_far);
304            Ok(())
305        })?;
306
307    // concatenate substrings into a buffer
308    let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize());
309
310    new_starts_ends
311        .iter()
312        .map(|(start, end)| {
313            let start = start.as_usize();
314            let end = end.as_usize();
315            &data[start..end]
316        })
317        .for_each(|slice| new_values.extend_from_slice(slice));
318
319    let offsets = OffsetBuffer::new(new_offsets.into());
320    let values = new_values.into();
321    let nulls = array
322        .nulls()
323        .map(|n| n.inner().sliced())
324        .and_then(|b| NullBuffer::from_unsliced_buffer(b, array.len()));
325    Ok(Arc::new(GenericByteArray::<T>::new(offsets, values, nulls)))
326}
327
328fn fixed_size_binary_substring(
329    array: &FixedSizeBinaryArray,
330    old_len: usize,
331    start: i64,
332    length: Option<u64>,
333) -> Result<ArrayRef, ArrowError> {
334    let new_start = match start.cmp(&0) {
335        Ordering::Greater => usize::try_from(start).unwrap_or(usize::MAX).min(old_len),
336        Ordering::Equal => 0,
337        Ordering::Less => {
338            let offset = usize::try_from(start.unsigned_abs()).unwrap_or(usize::MAX);
339            old_len.saturating_sub(offset)
340        }
341    };
342
343    let new_len = match length {
344        Some(len) => usize::try_from(len)
345            .unwrap_or(usize::MAX)
346            .min(old_len - new_start),
347        None => old_len - new_start,
348    };
349
350    // build value buffer
351    let num_of_elements = array.len();
352    let data = array.value_data();
353    let capacity = num_of_elements
354        .checked_mul(new_len)
355        .expect("capacity overflow");
356    let mut new_values = MutableBuffer::new(capacity);
357    (0..num_of_elements)
358        .map(|idx| {
359            let offset = idx * array.value_size();
360            (offset + new_start, offset + new_start + new_len)
361        })
362        .for_each(|(start, end)| new_values.extend_from_slice(&data[start..end]));
363
364    let mut nulls = array
365        .nulls()
366        .map(|n| n.inner().sliced())
367        .and_then(|b| NullBuffer::from_unsliced_buffer(b, num_of_elements));
368
369    if new_len == 0 && nulls.is_none() {
370        // FixedSizeBinaryArray::new takes length from the values buffer, except when size == 0.
371        // In that case it uses the null buffer length, so preserve the original length here.
372        // Example: ["", "", ""] -> substring(..., 1, Some(2)) should keep len=3;
373        // otherwise it collapses to an empty array (len=0).
374        nulls = Some(NullBuffer::new_valid(num_of_elements));
375    }
376
377    let new_len: i32 = new_len.try_into().expect("new_len overflow");
378
379    Ok(Arc::new(FixedSizeBinaryArray::new(
380        new_len,
381        new_values.into(),
382        nulls,
383    )))
384}
385
386#[cfg(test)]
387mod tests {
388    use super::*;
389    use arrow_buffer::BooleanBuffer;
390    use arrow_buffer::Buffer;
391
392    /// A helper macro to generate test cases.
393    /// # Arguments
394    /// * `input` - A vector which array can be built from.
395    /// * `start` - The start index of the substring.
396    /// * `len` - The length of the substring.
397    /// * `result` - The expected result of substring, which is a vector that array can be built from.
398    /// # Return
399    /// A vector of `(input, start, len, result)`.
400    ///
401    /// Users can provide any number of `(start, len, result)` to generate test cases for one `input`.
402    macro_rules! gen_test_cases {
403        ($input:expr, $(($start:expr, $len:expr, $result:expr)), *) => {
404            [
405                $(
406                    ($input.clone(), $start, $len, $result),
407                )*
408            ]
409        };
410    }
411
412    /// A helper macro to test the substring functions.
413    /// # Arguments
414    /// * `cases` - The test cases which is a vector of `(input, start, len, result)`.
415    ///   Please look at [`gen_test_cases`] to find how to generate it.
416    /// * `array_ty` - The array type.
417    /// * `substring_fn` - Either [`substring`] or [`substring_by_char`].
418    macro_rules! do_test {
419        ($cases:expr, $array_ty:ty, $substring_fn:ident) => {
420            $cases
421                .into_iter()
422                .for_each(|(array, start, length, expected)| {
423                    let array = <$array_ty>::from(array);
424                    let result = $substring_fn(&array, start, length).unwrap();
425                    let result = result.as_any().downcast_ref::<$array_ty>().unwrap();
426                    let expected = <$array_ty>::from(expected);
427                    assert_eq!(&expected, result);
428                })
429        };
430    }
431
432    /// A helper macro to test the substring functions for array types only implementing TryFrom.
433    macro_rules! do_test_tryfrom {
434        ($cases:expr, $array_ty:ty, $substring_fn:ident) => {
435            $cases
436                .into_iter()
437                .for_each(|(array, start, length, expected)| {
438                    let array = <$array_ty>::try_from(array).unwrap();
439                    let result = $substring_fn(&array, start, length).unwrap();
440                    let result = result.as_any().downcast_ref::<$array_ty>().unwrap();
441                    let expected = <$array_ty>::try_from(expected).unwrap();
442                    assert_eq!(&expected, result);
443                })
444        };
445    }
446
447    fn with_nulls_generic_binary<O: OffsetSizeTrait>() {
448        let input = vec![
449            Some("hello".as_bytes()),
450            None,
451            Some(&[0xf8, 0xf9, 0xff, 0xfa]),
452        ];
453        // all-nulls array is always identical
454        let base_case = gen_test_cases!(
455            vec![None, None, None],
456            (-1, Some(1), vec![None, None, None])
457        );
458        let cases = gen_test_cases!(
459            input,
460            // identity
461            (0, None, input.clone()),
462            // 0 length -> Nothing
463            (0, Some(0), vec![Some(&[]), None, Some(&[])]),
464            // high start -> Nothing
465            (1000, Some(0), vec![Some(&[]), None, Some(&[])]),
466            // high negative start -> identity
467            (-1000, None, input.clone()),
468            // high length -> identity
469            (0, Some(1000), input.clone())
470        );
471
472        do_test!(
473            [&base_case[..], &cases[..]].concat(),
474            GenericBinaryArray<O>,
475            substring
476        );
477    }
478
479    #[test]
480    fn with_nulls_binary() {
481        with_nulls_generic_binary::<i32>()
482    }
483
484    #[test]
485    fn with_nulls_large_binary() {
486        with_nulls_generic_binary::<i64>()
487    }
488
489    fn without_nulls_generic_binary<O: OffsetSizeTrait>() {
490        let input = vec!["hello".as_bytes(), b"", &[0xf8, 0xf9, 0xff, 0xfa]];
491        // empty array is always identical
492        let base_case = gen_test_cases!(
493            vec!["".as_bytes(), b"", b""],
494            (2, Some(1), vec!["".as_bytes(), b"", b""])
495        );
496        let cases = gen_test_cases!(
497            input,
498            // identity
499            (0, None, input.clone()),
500            // increase start
501            (1, None, vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]),
502            (2, None, vec![b"llo", b"", &[0xff, 0xfa]]),
503            (3, None, vec![b"lo", b"", &[0xfa]]),
504            (10, None, vec![b"", b"", b""]),
505            // increase start negatively
506            (-1, None, vec![b"o", b"", &[0xfa]]),
507            (-2, None, vec![b"lo", b"", &[0xff, 0xfa]]),
508            (-3, None, vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]),
509            (-10, None, input.clone()),
510            // increase length
511            (1, Some(1), vec![b"e", b"", &[0xf9]]),
512            (1, Some(2), vec![b"el", b"", &[0xf9, 0xff]]),
513            (1, Some(3), vec![b"ell", b"", &[0xf9, 0xff, 0xfa]]),
514            (1, Some(4), vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]),
515            (-3, Some(1), vec![b"l", b"", &[0xf9]]),
516            (-3, Some(2), vec![b"ll", b"", &[0xf9, 0xff]]),
517            (-3, Some(3), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]),
518            (-3, Some(4), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]])
519        );
520
521        do_test!(
522            [&base_case[..], &cases[..]].concat(),
523            GenericBinaryArray<O>,
524            substring
525        );
526    }
527
528    #[test]
529    fn without_nulls_binary() {
530        without_nulls_generic_binary::<i32>()
531    }
532
533    #[test]
534    fn without_nulls_large_binary() {
535        without_nulls_generic_binary::<i64>()
536    }
537
538    fn generic_binary_with_non_zero_offset<O: OffsetSizeTrait>() {
539        let values = 0_u8..15;
540        let offsets = &[
541            O::zero(),
542            O::from_usize(5).unwrap(),
543            O::from_usize(10).unwrap(),
544            O::from_usize(15).unwrap(),
545        ];
546        // set the first and third element to be valid
547        let bitmap = [0b101_u8];
548
549        let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
550        let values = Buffer::from_iter(values);
551        let nulls = Some(NullBuffer::new(BooleanBuffer::new(
552            Buffer::from(bitmap),
553            0,
554            3,
555        )));
556        // array is `[null, [10, 11, 12, 13, 14]]`
557        let array = GenericBinaryArray::<O>::new(offsets, values, nulls).slice(1, 2);
558        // result is `[null, [11, 12, 13, 14]]`
559        let result = substring(&array, 1, None).unwrap();
560        let result = result
561            .as_any()
562            .downcast_ref::<GenericBinaryArray<O>>()
563            .unwrap();
564        let expected =
565            GenericBinaryArray::<O>::from_opt_vec(vec![None, Some(&[11_u8, 12, 13, 14])]);
566        assert_eq!(result, &expected);
567    }
568
569    #[test]
570    fn binary_with_non_zero_offset() {
571        generic_binary_with_non_zero_offset::<i32>()
572    }
573
574    #[test]
575    fn large_binary_with_non_zero_offset() {
576        generic_binary_with_non_zero_offset::<i64>()
577    }
578
579    #[test]
580    fn with_nulls_fixed_size_binary() {
581        let input = vec![Some("cat".as_bytes()), None, Some(&[0xf8, 0xf9, 0xff])];
582        // all-nulls array is always identical
583        let base_case =
584            gen_test_cases!(vec![None, None, None], (3, Some(2), vec![None, None, None]));
585        let cases = gen_test_cases!(
586            input,
587            // identity
588            (0, None, input.clone()),
589            // increase start
590            (1, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
591            (2, None, vec![Some(b"t"), None, Some(&[0xff])]),
592            (3, None, vec![Some(b""), None, Some(b"")]),
593            (10, None, vec![Some(b""), None, Some(b"")]),
594            // increase start negatively
595            (-1, None, vec![Some(b"t"), None, Some(&[0xff])]),
596            (-2, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
597            (-3, None, input.clone()),
598            (-10, None, input.clone()),
599            // increase length
600            (1, Some(1), vec![Some(b"a"), None, Some(&[0xf9])]),
601            (1, Some(2), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
602            (1, Some(3), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
603            (-3, Some(1), vec![Some(b"c"), None, Some(&[0xf8])]),
604            (-3, Some(2), vec![Some(b"ca"), None, Some(&[0xf8, 0xf9])]),
605            (-3, Some(3), input.clone()),
606            (-3, Some(4), input.clone())
607        );
608
609        do_test_tryfrom!(
610            [&base_case[..], &cases[..]].concat(),
611            FixedSizeBinaryArray,
612            substring
613        );
614    }
615
616    #[test]
617    fn without_nulls_fixed_size_binary() {
618        let input = vec!["cat".as_bytes(), b"dog", &[0xf8, 0xf9, 0xff]];
619        // empty array is always identical
620        let base_case = gen_test_cases!(
621            vec!["".as_bytes(), &[], &[]],
622            (1, Some(2), vec!["".as_bytes(), &[], &[]])
623        );
624        let cases = gen_test_cases!(
625            input,
626            // identity
627            (0, None, input.clone()),
628            // increase start
629            (1, None, vec![b"at", b"og", &[0xf9, 0xff]]),
630            (2, None, vec![b"t", b"g", &[0xff]]),
631            (3, None, vec![&[], &[], &[]]),
632            (10, None, vec![&[], &[], &[]]),
633            // increase start negatively
634            (-1, None, vec![b"t", b"g", &[0xff]]),
635            (-2, None, vec![b"at", b"og", &[0xf9, 0xff]]),
636            (-3, None, input.clone()),
637            (-10, None, input.clone()),
638            // increase length
639            (1, Some(1), vec![b"a", b"o", &[0xf9]]),
640            (1, Some(2), vec![b"at", b"og", &[0xf9, 0xff]]),
641            (1, Some(3), vec![b"at", b"og", &[0xf9, 0xff]]),
642            (-3, Some(1), vec![b"c", b"d", &[0xf8]]),
643            (-3, Some(2), vec![b"ca", b"do", &[0xf8, 0xf9]]),
644            (-3, Some(3), input.clone()),
645            (-3, Some(4), input.clone())
646        );
647
648        do_test_tryfrom!(
649            [&base_case[..], &cases[..]].concat(),
650            FixedSizeBinaryArray,
651            substring
652        );
653    }
654
655    #[test]
656    fn fixed_size_binary_with_non_zero_offset() {
657        let values: [u8; 15] = *b"hellotherearrow";
658        // set the first and third element to be valid
659        let bits_v = [0b101_u8];
660
661        let nulls = Some(NullBuffer::new(BooleanBuffer::new(
662            Buffer::from(bits_v),
663            0,
664            3,
665        )));
666        // array is `[null, "arrow"]`
667        let array = FixedSizeBinaryArray::new(5, Buffer::from(&values), nulls).slice(1, 2);
668        // result is `[null, "rrow"]`
669        let result = substring(&array, 1, None).unwrap();
670        let result = result
671            .as_any()
672            .downcast_ref::<FixedSizeBinaryArray>()
673            .unwrap();
674        let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
675            vec![None, Some(b"rrow")].into_iter(),
676            4,
677        )
678        .unwrap();
679        assert_eq!(result, &expected);
680    }
681
682    fn with_nulls_generic_string<O: OffsetSizeTrait>() {
683        let input = vec![Some("hello"), None, Some("word")];
684        // all-nulls array is always identical
685        let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None]));
686        let cases = gen_test_cases!(
687            input,
688            // identity
689            (0, None, input.clone()),
690            // 0 length -> Nothing
691            (0, Some(0), vec![Some(""), None, Some("")]),
692            // high start -> Nothing
693            (1000, Some(0), vec![Some(""), None, Some("")]),
694            // high negative start -> identity
695            (-1000, None, input.clone()),
696            // high length -> identity
697            (0, Some(1000), input.clone())
698        );
699
700        do_test!(
701            [&base_case[..], &cases[..]].concat(),
702            GenericStringArray<O>,
703            substring
704        );
705    }
706
707    #[test]
708    fn with_nulls_string() {
709        with_nulls_generic_string::<i32>()
710    }
711
712    #[test]
713    fn with_nulls_large_string() {
714        with_nulls_generic_string::<i64>()
715    }
716
717    fn without_nulls_generic_string<O: OffsetSizeTrait>() {
718        let input = vec!["hello", "", "word"];
719        // empty array is always identical
720        let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""]));
721        let cases = gen_test_cases!(
722            input,
723            // identity
724            (0, None, input.clone()),
725            (1, None, vec!["ello", "", "ord"]),
726            (2, None, vec!["llo", "", "rd"]),
727            (3, None, vec!["lo", "", "d"]),
728            (10, None, vec!["", "", ""]),
729            // increase start negatively
730            (-1, None, vec!["o", "", "d"]),
731            (-2, None, vec!["lo", "", "rd"]),
732            (-3, None, vec!["llo", "", "ord"]),
733            (-10, None, input.clone()),
734            // increase length
735            (1, Some(1), vec!["e", "", "o"]),
736            (1, Some(2), vec!["el", "", "or"]),
737            (1, Some(3), vec!["ell", "", "ord"]),
738            (1, Some(4), vec!["ello", "", "ord"]),
739            (-3, Some(1), vec!["l", "", "o"]),
740            (-3, Some(2), vec!["ll", "", "or"]),
741            (-3, Some(3), vec!["llo", "", "ord"]),
742            (-3, Some(4), vec!["llo", "", "ord"])
743        );
744
745        do_test!(
746            [&base_case[..], &cases[..]].concat(),
747            GenericStringArray<O>,
748            substring
749        );
750    }
751
752    #[test]
753    fn without_nulls_string() {
754        without_nulls_generic_string::<i32>()
755    }
756
757    #[test]
758    fn without_nulls_large_string() {
759        without_nulls_generic_string::<i64>()
760    }
761
762    fn generic_string_with_non_zero_offset<O: OffsetSizeTrait>() {
763        let values = b"hellotherearrow";
764        let offsets = &[
765            O::zero(),
766            O::from_usize(5).unwrap(),
767            O::from_usize(10).unwrap(),
768            O::from_usize(15).unwrap(),
769        ];
770        // set the first and third element to be valid
771        let bitmap = [0b101_u8];
772
773        let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
774        let values = Buffer::from(values);
775        let nulls = Some(NullBuffer::new(BooleanBuffer::new(
776            Buffer::from(bitmap),
777            0,
778            3,
779        )));
780        // array is `[null, "arrow"]`
781        let array = GenericStringArray::<O>::new(offsets, values, nulls).slice(1, 2);
782        // result is `[null, "rrow"]`
783        let result = substring(&array, 1, None).unwrap();
784        let result = result
785            .as_any()
786            .downcast_ref::<GenericStringArray<O>>()
787            .unwrap();
788        let expected = GenericStringArray::<O>::from(vec![None, Some("rrow")]);
789        assert_eq!(result, &expected);
790    }
791
792    #[test]
793    fn string_with_non_zero_offset() {
794        generic_string_with_non_zero_offset::<i32>()
795    }
796
797    #[test]
798    fn large_string_with_non_zero_offset() {
799        generic_string_with_non_zero_offset::<i64>()
800    }
801
802    fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() {
803        let input = vec![Some("hello"), None, Some("Γ ⊢x:T")];
804        // all-nulls array is always identical
805        let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None]));
806        let cases = gen_test_cases!(
807            input,
808            // identity
809            (0, None, input.clone()),
810            // 0 length -> Nothing
811            (0, Some(0), vec![Some(""), None, Some("")]),
812            // high start -> Nothing
813            (1000, Some(0), vec![Some(""), None, Some("")]),
814            // high negative start -> identity
815            (-1000, None, input.clone()),
816            // high length -> identity
817            (0, Some(1000), input.clone())
818        );
819
820        do_test!(
821            [&base_case[..], &cases[..]].concat(),
822            GenericStringArray<O>,
823            substring_by_char
824        );
825    }
826
827    #[test]
828    fn with_nulls_string_by_char() {
829        with_nulls_generic_string_by_char::<i32>()
830    }
831
832    #[test]
833    fn with_nulls_large_string_by_char() {
834        with_nulls_generic_string_by_char::<i64>()
835    }
836
837    fn without_nulls_generic_string_by_char<O: OffsetSizeTrait>() {
838        let input = vec!["hello", "", "Γ ⊢x:T"];
839        // empty array is always identical
840        let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""]));
841        let cases = gen_test_cases!(
842            input,
843            //identity
844            (0, None, input.clone()),
845            // increase start
846            (1, None, vec!["ello", "", " ⊢x:T"]),
847            (2, None, vec!["llo", "", "⊢x:T"]),
848            (3, None, vec!["lo", "", "x:T"]),
849            (10, None, vec!["", "", ""]),
850            // increase start negatively
851            (-1, None, vec!["o", "", "T"]),
852            (-2, None, vec!["lo", "", ":T"]),
853            (-4, None, vec!["ello", "", "⊢x:T"]),
854            (-10, None, input.clone()),
855            // increase length
856            (1, Some(1), vec!["e", "", " "]),
857            (1, Some(2), vec!["el", "", " ⊢"]),
858            (1, Some(3), vec!["ell", "", " ⊢x"]),
859            (1, Some(6), vec!["ello", "", " ⊢x:T"]),
860            (-4, Some(1), vec!["e", "", "⊢"]),
861            (-4, Some(2), vec!["el", "", "⊢x"]),
862            (-4, Some(3), vec!["ell", "", "⊢x:"]),
863            (-4, Some(4), vec!["ello", "", "⊢x:T"])
864        );
865
866        do_test!(
867            [&base_case[..], &cases[..]].concat(),
868            GenericStringArray<O>,
869            substring_by_char
870        );
871    }
872
873    #[test]
874    fn without_nulls_string_by_char() {
875        without_nulls_generic_string_by_char::<i32>()
876    }
877
878    #[test]
879    fn without_nulls_large_string_by_char() {
880        without_nulls_generic_string_by_char::<i64>()
881    }
882
883    fn generic_string_by_char_with_non_zero_offset<O: OffsetSizeTrait>() {
884        let values = "S→T = Πx:S.T";
885        let offsets = &[
886            O::zero(),
887            O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap()).unwrap(),
888            O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap()).unwrap(),
889            O::from_usize(values.len()).unwrap(),
890        ];
891        // set the first and third element to be valid
892        let bitmap = [0b101_u8];
893
894        let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
895        let values = Buffer::from(values.as_bytes());
896        let nulls = Some(NullBuffer::new(BooleanBuffer::new(
897            Buffer::from(bitmap),
898            0,
899            3,
900        )));
901        // array is `[null, "Πx:S.T"]`
902        let array = GenericStringArray::<O>::new(offsets, values, nulls).slice(1, 2);
903        // result is `[null, "x:S.T"]`
904        let result = substring_by_char(&array, 1, None).unwrap();
905        let expected = GenericStringArray::<O>::from(vec![None, Some("x:S.T")]);
906        assert_eq!(result, expected);
907    }
908
909    #[test]
910    fn string_with_non_zero_offset_by_char() {
911        generic_string_by_char_with_non_zero_offset::<i32>()
912    }
913
914    #[test]
915    fn large_string_with_non_zero_offset_by_char() {
916        generic_string_by_char_with_non_zero_offset::<i64>()
917    }
918
919    #[test]
920    fn dictionary() {
921        _dictionary::<Int8Type>();
922        _dictionary::<Int16Type>();
923        _dictionary::<Int32Type>();
924        _dictionary::<Int64Type>();
925        _dictionary::<UInt8Type>();
926        _dictionary::<UInt16Type>();
927        _dictionary::<UInt32Type>();
928        _dictionary::<UInt64Type>();
929    }
930
931    fn _dictionary<K: ArrowDictionaryKeyType>() {
932        const TOTAL: i32 = 100;
933
934        let v = ["aaa", "bbb", "ccc", "ddd", "eee"];
935        let data: Vec<Option<&str>> = (0..TOTAL)
936            .map(|n| {
937                let i = n % 5;
938                if i == 3 { None } else { Some(v[i as usize]) }
939            })
940            .collect();
941
942        let dict_array: DictionaryArray<K> = data.clone().into_iter().collect();
943
944        let expected: Vec<Option<&str>> = data.iter().map(|opt| opt.map(|s| &s[1..3])).collect();
945
946        let res = substring(&dict_array, 1, Some(2)).unwrap();
947        let actual = res.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
948        let actual: Vec<Option<&str>> = actual
949            .values()
950            .as_any()
951            .downcast_ref::<GenericStringArray<i32>>()
952            .unwrap()
953            .take_iter(actual.keys_iter())
954            .collect();
955
956        for i in 0..TOTAL as usize {
957            assert_eq!(expected[i], actual[i],);
958        }
959    }
960
961    #[test]
962    fn check_invalid_array_type() {
963        let array = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
964        let err = substring(&array, 0, None).unwrap_err().to_string();
965        assert!(err.contains("substring does not support type"));
966    }
967
968    // tests for the utf-8 validation checking
969    #[test]
970    fn check_start_index() {
971        let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
972        let err = substring(&array, -1, None).unwrap_err().to_string();
973        assert!(err.contains("invalid utf-8 boundary"));
974    }
975
976    #[test]
977    fn check_length() {
978        let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
979        let err = substring(&array, 0, Some(5)).unwrap_err().to_string();
980        assert!(err.contains("invalid utf-8 boundary"));
981    }
982
983    #[test]
984    fn non_utf8_bytes() {
985        // non-utf8 bytes
986        let bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD, 0xE8, 0xAF, 0xAD];
987        let array = BinaryArray::from(vec![Some(bytes)]);
988        let arr = substring(&array, 0, Some(5)).unwrap();
989        let actual = arr.as_any().downcast_ref::<BinaryArray>().unwrap();
990
991        let expected_bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5];
992        let expected = BinaryArray::from(vec![Some(expected_bytes)]);
993        assert_eq!(expected, *actual);
994    }
995}