1use arrow_array::builder::BufferBuilder;
23use arrow_array::types::*;
24use arrow_array::*;
25use arrow_buffer::{ArrowNativeType, MutableBuffer, NullBuffer, OffsetBuffer};
26use arrow_schema::{ArrowError, DataType};
27use num_traits::Zero;
28use std::cmp::Ordering;
29use std::sync::Arc;
30
31pub fn substring(
72 array: &dyn Array,
73 start: i64,
74 length: Option<u64>,
75) -> Result<ArrayRef, ArrowError> {
76 macro_rules! substring_dict {
77 ($kt: ident, $($t: ident: $gt: ident), *) => {
78 match $kt.as_ref() {
79 $(
80 &DataType::$t => {
81 let dict = array
82 .as_any()
83 .downcast_ref::<DictionaryArray<$gt>>()
84 .unwrap_or_else(|| {
85 panic!("Expect 'DictionaryArray<{}>' but got array of data type {:?}",
86 stringify!($gt), array.data_type())
87 });
88 let values = substring(dict.values(), start, length)?;
89 Ok(Arc::new(dict.with_values(values)))
90 },
91 )*
92 t => panic!("Unsupported dictionary key type: {}", t)
93 }
94 }
95 }
96
97 match array.data_type() {
98 DataType::Dictionary(kt, _) => {
99 substring_dict!(
100 kt,
101 Int8: Int8Type,
102 Int16: Int16Type,
103 Int32: Int32Type,
104 Int64: Int64Type,
105 UInt8: UInt8Type,
106 UInt16: UInt16Type,
107 UInt32: UInt32Type,
108 UInt64: UInt64Type
109 )
110 }
111 DataType::LargeBinary => byte_substring(
112 array
113 .as_any()
114 .downcast_ref::<LargeBinaryArray>()
115 .expect("A large binary is expected"),
116 start,
117 length.map(|e| e as i64),
118 ),
119 DataType::Binary => byte_substring(
120 array
121 .as_any()
122 .downcast_ref::<BinaryArray>()
123 .expect("A binary is expected"),
124 start as i32,
125 length.map(|e| e as i32),
126 ),
127 DataType::FixedSizeBinary(old_len) => {
128 let old_len: usize = (*old_len)
129 .try_into()
130 .expect("negative FixedSizeBinary value length");
131 fixed_size_binary_substring(
132 array
133 .as_any()
134 .downcast_ref::<FixedSizeBinaryArray>()
135 .expect("a fixed size binary is expected"),
136 old_len,
137 start,
138 length,
139 )
140 }
141 DataType::LargeUtf8 => byte_substring(
142 array
143 .as_any()
144 .downcast_ref::<LargeStringArray>()
145 .expect("A large string is expected"),
146 start,
147 length.map(|e| e as i64),
148 ),
149 DataType::Utf8 => byte_substring(
150 array
151 .as_any()
152 .downcast_ref::<StringArray>()
153 .expect("A string is expected"),
154 start as i32,
155 length.map(|e| e as i32),
156 ),
157 _ => Err(ArrowError::ComputeError(format!(
158 "substring does not support type {:?}",
159 array.data_type()
160 ))),
161 }
162}
163
164pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
193 array: &GenericStringArray<OffsetSize>,
194 start: i64,
195 length: Option<u64>,
196) -> Result<GenericStringArray<OffsetSize>, ArrowError> {
197 let mut vals = BufferBuilder::<u8>::new({
198 let offsets = array.value_offsets();
199 (offsets[array.len()] - offsets[0]).to_usize().unwrap()
200 });
201 let mut new_offsets = BufferBuilder::<OffsetSize>::new(array.len() + 1);
202 new_offsets.append(OffsetSize::zero());
203 let length = length.map(|len| len.to_usize().unwrap());
204
205 array.iter().for_each(|val| {
206 if let Some(val) = val {
207 let char_count = val.chars().count();
208 let start = if start >= 0 {
209 start.to_usize().unwrap()
210 } else {
211 char_count - (-start).to_usize().unwrap().min(char_count)
212 };
213 let (start_offset, end_offset) = get_start_end_offset(val, start, length);
214 vals.append_slice(&val.as_bytes()[start_offset..end_offset]);
215 }
216 new_offsets.append(OffsetSize::from_usize(vals.len()).unwrap());
217 });
218 let offsets = OffsetBuffer::new(new_offsets.finish().into());
219 let values = vals.finish();
220 let nulls = array
221 .nulls()
222 .map(|n| n.inner().sliced())
223 .and_then(|b| NullBuffer::from_unsliced_buffer(b, array.len()));
224 Ok(GenericStringArray::<OffsetSize>::new(
225 offsets, values, nulls,
226 ))
227}
228
229fn get_start_end_offset(val: &str, start: usize, length: Option<usize>) -> (usize, usize) {
235 let len = val.len();
236 let mut offset_char_iter = val.char_indices();
237 let start_offset = offset_char_iter
238 .nth(start)
239 .map_or(len, |(offset, _)| offset);
240 let end_offset = length.map_or(len, |length| {
241 if length > 0 {
242 offset_char_iter
243 .nth(length - 1)
244 .map_or(len, |(offset, _)| offset)
245 } else {
246 start_offset
247 }
248 });
249 (start_offset, end_offset)
250}
251
252fn byte_substring<T: ByteArrayType>(
253 array: &GenericByteArray<T>,
254 start: T::Offset,
255 length: Option<T::Offset>,
256) -> Result<ArrayRef, ArrowError>
257where
258 <T as ByteArrayType>::Native: PartialEq,
259{
260 let offsets = array.value_offsets();
261 let data = array.value_data();
262 let zero = <T::Offset as Zero>::zero();
263
264 let check_char_boundary = {
266 |offset: T::Offset| {
267 if !matches!(T::DATA_TYPE, DataType::Utf8 | DataType::LargeUtf8) {
268 return Ok(offset);
269 }
270 let data_str = unsafe { std::str::from_utf8_unchecked(data) };
272 let offset_usize = offset.as_usize();
273 if data_str.is_char_boundary(offset_usize) {
274 Ok(offset)
275 } else {
276 Err(ArrowError::ComputeError(format!(
277 "The offset {offset_usize} is at an invalid utf-8 boundary."
278 )))
279 }
280 }
281 };
282
283 let mut new_starts_ends: Vec<(T::Offset, T::Offset)> = Vec::with_capacity(array.len());
285 let mut new_offsets: Vec<T::Offset> = Vec::with_capacity(array.len() + 1);
286 let mut len_so_far = zero;
287 new_offsets.push(zero);
288
289 offsets
290 .windows(2)
291 .try_for_each(|pair| -> Result<(), ArrowError> {
292 let new_start = match start.cmp(&zero) {
293 Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?,
294 Ordering::Equal => pair[0],
295 Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?,
296 };
297 let new_end = match length {
298 Some(length) => check_char_boundary((length + new_start).min(pair[1]))?,
299 None => pair[1],
300 };
301 len_so_far += new_end - new_start;
302 new_starts_ends.push((new_start, new_end));
303 new_offsets.push(len_so_far);
304 Ok(())
305 })?;
306
307 let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize());
309
310 new_starts_ends
311 .iter()
312 .map(|(start, end)| {
313 let start = start.as_usize();
314 let end = end.as_usize();
315 &data[start..end]
316 })
317 .for_each(|slice| new_values.extend_from_slice(slice));
318
319 let offsets = OffsetBuffer::new(new_offsets.into());
320 let values = new_values.into();
321 let nulls = array
322 .nulls()
323 .map(|n| n.inner().sliced())
324 .and_then(|b| NullBuffer::from_unsliced_buffer(b, array.len()));
325 Ok(Arc::new(GenericByteArray::<T>::new(offsets, values, nulls)))
326}
327
328fn fixed_size_binary_substring(
329 array: &FixedSizeBinaryArray,
330 old_len: usize,
331 start: i64,
332 length: Option<u64>,
333) -> Result<ArrayRef, ArrowError> {
334 let new_start = match start.cmp(&0) {
335 Ordering::Greater => usize::try_from(start).unwrap_or(usize::MAX).min(old_len),
336 Ordering::Equal => 0,
337 Ordering::Less => {
338 let offset = usize::try_from(start.unsigned_abs()).unwrap_or(usize::MAX);
339 old_len.saturating_sub(offset)
340 }
341 };
342
343 let new_len = match length {
344 Some(len) => usize::try_from(len)
345 .unwrap_or(usize::MAX)
346 .min(old_len - new_start),
347 None => old_len - new_start,
348 };
349
350 let num_of_elements = array.len();
352 let data = array.value_data();
353 let capacity = num_of_elements
354 .checked_mul(new_len)
355 .expect("capacity overflow");
356 let mut new_values = MutableBuffer::new(capacity);
357 (0..num_of_elements)
358 .map(|idx| {
359 let offset = idx * array.value_size();
360 (offset + new_start, offset + new_start + new_len)
361 })
362 .for_each(|(start, end)| new_values.extend_from_slice(&data[start..end]));
363
364 let mut nulls = array
365 .nulls()
366 .map(|n| n.inner().sliced())
367 .and_then(|b| NullBuffer::from_unsliced_buffer(b, num_of_elements));
368
369 if new_len == 0 && nulls.is_none() {
370 nulls = Some(NullBuffer::new_valid(num_of_elements));
375 }
376
377 let new_len: i32 = new_len.try_into().expect("new_len overflow");
378
379 Ok(Arc::new(FixedSizeBinaryArray::new(
380 new_len,
381 new_values.into(),
382 nulls,
383 )))
384}
385
386#[cfg(test)]
387mod tests {
388 use super::*;
389 use arrow_buffer::BooleanBuffer;
390 use arrow_buffer::Buffer;
391
392 macro_rules! gen_test_cases {
403 ($input:expr, $(($start:expr, $len:expr, $result:expr)), *) => {
404 [
405 $(
406 ($input.clone(), $start, $len, $result),
407 )*
408 ]
409 };
410 }
411
412 macro_rules! do_test {
419 ($cases:expr, $array_ty:ty, $substring_fn:ident) => {
420 $cases
421 .into_iter()
422 .for_each(|(array, start, length, expected)| {
423 let array = <$array_ty>::from(array);
424 let result = $substring_fn(&array, start, length).unwrap();
425 let result = result.as_any().downcast_ref::<$array_ty>().unwrap();
426 let expected = <$array_ty>::from(expected);
427 assert_eq!(&expected, result);
428 })
429 };
430 }
431
432 macro_rules! do_test_tryfrom {
434 ($cases:expr, $array_ty:ty, $substring_fn:ident) => {
435 $cases
436 .into_iter()
437 .for_each(|(array, start, length, expected)| {
438 let array = <$array_ty>::try_from(array).unwrap();
439 let result = $substring_fn(&array, start, length).unwrap();
440 let result = result.as_any().downcast_ref::<$array_ty>().unwrap();
441 let expected = <$array_ty>::try_from(expected).unwrap();
442 assert_eq!(&expected, result);
443 })
444 };
445 }
446
447 fn with_nulls_generic_binary<O: OffsetSizeTrait>() {
448 let input = vec![
449 Some("hello".as_bytes()),
450 None,
451 Some(&[0xf8, 0xf9, 0xff, 0xfa]),
452 ];
453 let base_case = gen_test_cases!(
455 vec![None, None, None],
456 (-1, Some(1), vec![None, None, None])
457 );
458 let cases = gen_test_cases!(
459 input,
460 (0, None, input.clone()),
462 (0, Some(0), vec![Some(&[]), None, Some(&[])]),
464 (1000, Some(0), vec![Some(&[]), None, Some(&[])]),
466 (-1000, None, input.clone()),
468 (0, Some(1000), input.clone())
470 );
471
472 do_test!(
473 [&base_case[..], &cases[..]].concat(),
474 GenericBinaryArray<O>,
475 substring
476 );
477 }
478
479 #[test]
480 fn with_nulls_binary() {
481 with_nulls_generic_binary::<i32>()
482 }
483
484 #[test]
485 fn with_nulls_large_binary() {
486 with_nulls_generic_binary::<i64>()
487 }
488
489 fn without_nulls_generic_binary<O: OffsetSizeTrait>() {
490 let input = vec!["hello".as_bytes(), b"", &[0xf8, 0xf9, 0xff, 0xfa]];
491 let base_case = gen_test_cases!(
493 vec!["".as_bytes(), b"", b""],
494 (2, Some(1), vec!["".as_bytes(), b"", b""])
495 );
496 let cases = gen_test_cases!(
497 input,
498 (0, None, input.clone()),
500 (1, None, vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]),
502 (2, None, vec![b"llo", b"", &[0xff, 0xfa]]),
503 (3, None, vec![b"lo", b"", &[0xfa]]),
504 (10, None, vec![b"", b"", b""]),
505 (-1, None, vec![b"o", b"", &[0xfa]]),
507 (-2, None, vec![b"lo", b"", &[0xff, 0xfa]]),
508 (-3, None, vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]),
509 (-10, None, input.clone()),
510 (1, Some(1), vec![b"e", b"", &[0xf9]]),
512 (1, Some(2), vec![b"el", b"", &[0xf9, 0xff]]),
513 (1, Some(3), vec![b"ell", b"", &[0xf9, 0xff, 0xfa]]),
514 (1, Some(4), vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]),
515 (-3, Some(1), vec![b"l", b"", &[0xf9]]),
516 (-3, Some(2), vec![b"ll", b"", &[0xf9, 0xff]]),
517 (-3, Some(3), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]),
518 (-3, Some(4), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]])
519 );
520
521 do_test!(
522 [&base_case[..], &cases[..]].concat(),
523 GenericBinaryArray<O>,
524 substring
525 );
526 }
527
528 #[test]
529 fn without_nulls_binary() {
530 without_nulls_generic_binary::<i32>()
531 }
532
533 #[test]
534 fn without_nulls_large_binary() {
535 without_nulls_generic_binary::<i64>()
536 }
537
538 fn generic_binary_with_non_zero_offset<O: OffsetSizeTrait>() {
539 let values = 0_u8..15;
540 let offsets = &[
541 O::zero(),
542 O::from_usize(5).unwrap(),
543 O::from_usize(10).unwrap(),
544 O::from_usize(15).unwrap(),
545 ];
546 let bitmap = [0b101_u8];
548
549 let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
550 let values = Buffer::from_iter(values);
551 let nulls = Some(NullBuffer::new(BooleanBuffer::new(
552 Buffer::from(bitmap),
553 0,
554 3,
555 )));
556 let array = GenericBinaryArray::<O>::new(offsets, values, nulls).slice(1, 2);
558 let result = substring(&array, 1, None).unwrap();
560 let result = result
561 .as_any()
562 .downcast_ref::<GenericBinaryArray<O>>()
563 .unwrap();
564 let expected =
565 GenericBinaryArray::<O>::from_opt_vec(vec![None, Some(&[11_u8, 12, 13, 14])]);
566 assert_eq!(result, &expected);
567 }
568
569 #[test]
570 fn binary_with_non_zero_offset() {
571 generic_binary_with_non_zero_offset::<i32>()
572 }
573
574 #[test]
575 fn large_binary_with_non_zero_offset() {
576 generic_binary_with_non_zero_offset::<i64>()
577 }
578
579 #[test]
580 fn with_nulls_fixed_size_binary() {
581 let input = vec![Some("cat".as_bytes()), None, Some(&[0xf8, 0xf9, 0xff])];
582 let base_case =
584 gen_test_cases!(vec![None, None, None], (3, Some(2), vec![None, None, None]));
585 let cases = gen_test_cases!(
586 input,
587 (0, None, input.clone()),
589 (1, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
591 (2, None, vec![Some(b"t"), None, Some(&[0xff])]),
592 (3, None, vec![Some(b""), None, Some(b"")]),
593 (10, None, vec![Some(b""), None, Some(b"")]),
594 (-1, None, vec![Some(b"t"), None, Some(&[0xff])]),
596 (-2, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
597 (-3, None, input.clone()),
598 (-10, None, input.clone()),
599 (1, Some(1), vec![Some(b"a"), None, Some(&[0xf9])]),
601 (1, Some(2), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
602 (1, Some(3), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
603 (-3, Some(1), vec![Some(b"c"), None, Some(&[0xf8])]),
604 (-3, Some(2), vec![Some(b"ca"), None, Some(&[0xf8, 0xf9])]),
605 (-3, Some(3), input.clone()),
606 (-3, Some(4), input.clone())
607 );
608
609 do_test_tryfrom!(
610 [&base_case[..], &cases[..]].concat(),
611 FixedSizeBinaryArray,
612 substring
613 );
614 }
615
616 #[test]
617 fn without_nulls_fixed_size_binary() {
618 let input = vec!["cat".as_bytes(), b"dog", &[0xf8, 0xf9, 0xff]];
619 let base_case = gen_test_cases!(
621 vec!["".as_bytes(), &[], &[]],
622 (1, Some(2), vec!["".as_bytes(), &[], &[]])
623 );
624 let cases = gen_test_cases!(
625 input,
626 (0, None, input.clone()),
628 (1, None, vec![b"at", b"og", &[0xf9, 0xff]]),
630 (2, None, vec![b"t", b"g", &[0xff]]),
631 (3, None, vec![&[], &[], &[]]),
632 (10, None, vec![&[], &[], &[]]),
633 (-1, None, vec![b"t", b"g", &[0xff]]),
635 (-2, None, vec![b"at", b"og", &[0xf9, 0xff]]),
636 (-3, None, input.clone()),
637 (-10, None, input.clone()),
638 (1, Some(1), vec![b"a", b"o", &[0xf9]]),
640 (1, Some(2), vec![b"at", b"og", &[0xf9, 0xff]]),
641 (1, Some(3), vec![b"at", b"og", &[0xf9, 0xff]]),
642 (-3, Some(1), vec![b"c", b"d", &[0xf8]]),
643 (-3, Some(2), vec![b"ca", b"do", &[0xf8, 0xf9]]),
644 (-3, Some(3), input.clone()),
645 (-3, Some(4), input.clone())
646 );
647
648 do_test_tryfrom!(
649 [&base_case[..], &cases[..]].concat(),
650 FixedSizeBinaryArray,
651 substring
652 );
653 }
654
655 #[test]
656 fn fixed_size_binary_with_non_zero_offset() {
657 let values: [u8; 15] = *b"hellotherearrow";
658 let bits_v = [0b101_u8];
660
661 let nulls = Some(NullBuffer::new(BooleanBuffer::new(
662 Buffer::from(bits_v),
663 0,
664 3,
665 )));
666 let array = FixedSizeBinaryArray::new(5, Buffer::from(&values), nulls).slice(1, 2);
668 let result = substring(&array, 1, None).unwrap();
670 let result = result
671 .as_any()
672 .downcast_ref::<FixedSizeBinaryArray>()
673 .unwrap();
674 let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
675 vec![None, Some(b"rrow")].into_iter(),
676 4,
677 )
678 .unwrap();
679 assert_eq!(result, &expected);
680 }
681
682 fn with_nulls_generic_string<O: OffsetSizeTrait>() {
683 let input = vec![Some("hello"), None, Some("word")];
684 let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None]));
686 let cases = gen_test_cases!(
687 input,
688 (0, None, input.clone()),
690 (0, Some(0), vec![Some(""), None, Some("")]),
692 (1000, Some(0), vec![Some(""), None, Some("")]),
694 (-1000, None, input.clone()),
696 (0, Some(1000), input.clone())
698 );
699
700 do_test!(
701 [&base_case[..], &cases[..]].concat(),
702 GenericStringArray<O>,
703 substring
704 );
705 }
706
707 #[test]
708 fn with_nulls_string() {
709 with_nulls_generic_string::<i32>()
710 }
711
712 #[test]
713 fn with_nulls_large_string() {
714 with_nulls_generic_string::<i64>()
715 }
716
717 fn without_nulls_generic_string<O: OffsetSizeTrait>() {
718 let input = vec!["hello", "", "word"];
719 let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""]));
721 let cases = gen_test_cases!(
722 input,
723 (0, None, input.clone()),
725 (1, None, vec!["ello", "", "ord"]),
726 (2, None, vec!["llo", "", "rd"]),
727 (3, None, vec!["lo", "", "d"]),
728 (10, None, vec!["", "", ""]),
729 (-1, None, vec!["o", "", "d"]),
731 (-2, None, vec!["lo", "", "rd"]),
732 (-3, None, vec!["llo", "", "ord"]),
733 (-10, None, input.clone()),
734 (1, Some(1), vec!["e", "", "o"]),
736 (1, Some(2), vec!["el", "", "or"]),
737 (1, Some(3), vec!["ell", "", "ord"]),
738 (1, Some(4), vec!["ello", "", "ord"]),
739 (-3, Some(1), vec!["l", "", "o"]),
740 (-3, Some(2), vec!["ll", "", "or"]),
741 (-3, Some(3), vec!["llo", "", "ord"]),
742 (-3, Some(4), vec!["llo", "", "ord"])
743 );
744
745 do_test!(
746 [&base_case[..], &cases[..]].concat(),
747 GenericStringArray<O>,
748 substring
749 );
750 }
751
752 #[test]
753 fn without_nulls_string() {
754 without_nulls_generic_string::<i32>()
755 }
756
757 #[test]
758 fn without_nulls_large_string() {
759 without_nulls_generic_string::<i64>()
760 }
761
762 fn generic_string_with_non_zero_offset<O: OffsetSizeTrait>() {
763 let values = b"hellotherearrow";
764 let offsets = &[
765 O::zero(),
766 O::from_usize(5).unwrap(),
767 O::from_usize(10).unwrap(),
768 O::from_usize(15).unwrap(),
769 ];
770 let bitmap = [0b101_u8];
772
773 let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
774 let values = Buffer::from(values);
775 let nulls = Some(NullBuffer::new(BooleanBuffer::new(
776 Buffer::from(bitmap),
777 0,
778 3,
779 )));
780 let array = GenericStringArray::<O>::new(offsets, values, nulls).slice(1, 2);
782 let result = substring(&array, 1, None).unwrap();
784 let result = result
785 .as_any()
786 .downcast_ref::<GenericStringArray<O>>()
787 .unwrap();
788 let expected = GenericStringArray::<O>::from(vec![None, Some("rrow")]);
789 assert_eq!(result, &expected);
790 }
791
792 #[test]
793 fn string_with_non_zero_offset() {
794 generic_string_with_non_zero_offset::<i32>()
795 }
796
797 #[test]
798 fn large_string_with_non_zero_offset() {
799 generic_string_with_non_zero_offset::<i64>()
800 }
801
802 fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() {
803 let input = vec![Some("hello"), None, Some("Γ ⊢x:T")];
804 let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None]));
806 let cases = gen_test_cases!(
807 input,
808 (0, None, input.clone()),
810 (0, Some(0), vec![Some(""), None, Some("")]),
812 (1000, Some(0), vec![Some(""), None, Some("")]),
814 (-1000, None, input.clone()),
816 (0, Some(1000), input.clone())
818 );
819
820 do_test!(
821 [&base_case[..], &cases[..]].concat(),
822 GenericStringArray<O>,
823 substring_by_char
824 );
825 }
826
827 #[test]
828 fn with_nulls_string_by_char() {
829 with_nulls_generic_string_by_char::<i32>()
830 }
831
832 #[test]
833 fn with_nulls_large_string_by_char() {
834 with_nulls_generic_string_by_char::<i64>()
835 }
836
837 fn without_nulls_generic_string_by_char<O: OffsetSizeTrait>() {
838 let input = vec!["hello", "", "Γ ⊢x:T"];
839 let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""]));
841 let cases = gen_test_cases!(
842 input,
843 (0, None, input.clone()),
845 (1, None, vec!["ello", "", " ⊢x:T"]),
847 (2, None, vec!["llo", "", "⊢x:T"]),
848 (3, None, vec!["lo", "", "x:T"]),
849 (10, None, vec!["", "", ""]),
850 (-1, None, vec!["o", "", "T"]),
852 (-2, None, vec!["lo", "", ":T"]),
853 (-4, None, vec!["ello", "", "⊢x:T"]),
854 (-10, None, input.clone()),
855 (1, Some(1), vec!["e", "", " "]),
857 (1, Some(2), vec!["el", "", " ⊢"]),
858 (1, Some(3), vec!["ell", "", " ⊢x"]),
859 (1, Some(6), vec!["ello", "", " ⊢x:T"]),
860 (-4, Some(1), vec!["e", "", "⊢"]),
861 (-4, Some(2), vec!["el", "", "⊢x"]),
862 (-4, Some(3), vec!["ell", "", "⊢x:"]),
863 (-4, Some(4), vec!["ello", "", "⊢x:T"])
864 );
865
866 do_test!(
867 [&base_case[..], &cases[..]].concat(),
868 GenericStringArray<O>,
869 substring_by_char
870 );
871 }
872
873 #[test]
874 fn without_nulls_string_by_char() {
875 without_nulls_generic_string_by_char::<i32>()
876 }
877
878 #[test]
879 fn without_nulls_large_string_by_char() {
880 without_nulls_generic_string_by_char::<i64>()
881 }
882
883 fn generic_string_by_char_with_non_zero_offset<O: OffsetSizeTrait>() {
884 let values = "S→T = Πx:S.T";
885 let offsets = &[
886 O::zero(),
887 O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap()).unwrap(),
888 O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap()).unwrap(),
889 O::from_usize(values.len()).unwrap(),
890 ];
891 let bitmap = [0b101_u8];
893
894 let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
895 let values = Buffer::from(values.as_bytes());
896 let nulls = Some(NullBuffer::new(BooleanBuffer::new(
897 Buffer::from(bitmap),
898 0,
899 3,
900 )));
901 let array = GenericStringArray::<O>::new(offsets, values, nulls).slice(1, 2);
903 let result = substring_by_char(&array, 1, None).unwrap();
905 let expected = GenericStringArray::<O>::from(vec![None, Some("x:S.T")]);
906 assert_eq!(result, expected);
907 }
908
909 #[test]
910 fn string_with_non_zero_offset_by_char() {
911 generic_string_by_char_with_non_zero_offset::<i32>()
912 }
913
914 #[test]
915 fn large_string_with_non_zero_offset_by_char() {
916 generic_string_by_char_with_non_zero_offset::<i64>()
917 }
918
919 #[test]
920 fn dictionary() {
921 _dictionary::<Int8Type>();
922 _dictionary::<Int16Type>();
923 _dictionary::<Int32Type>();
924 _dictionary::<Int64Type>();
925 _dictionary::<UInt8Type>();
926 _dictionary::<UInt16Type>();
927 _dictionary::<UInt32Type>();
928 _dictionary::<UInt64Type>();
929 }
930
931 fn _dictionary<K: ArrowDictionaryKeyType>() {
932 const TOTAL: i32 = 100;
933
934 let v = ["aaa", "bbb", "ccc", "ddd", "eee"];
935 let data: Vec<Option<&str>> = (0..TOTAL)
936 .map(|n| {
937 let i = n % 5;
938 if i == 3 { None } else { Some(v[i as usize]) }
939 })
940 .collect();
941
942 let dict_array: DictionaryArray<K> = data.clone().into_iter().collect();
943
944 let expected: Vec<Option<&str>> = data.iter().map(|opt| opt.map(|s| &s[1..3])).collect();
945
946 let res = substring(&dict_array, 1, Some(2)).unwrap();
947 let actual = res.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
948 let actual: Vec<Option<&str>> = actual
949 .values()
950 .as_any()
951 .downcast_ref::<GenericStringArray<i32>>()
952 .unwrap()
953 .take_iter(actual.keys_iter())
954 .collect();
955
956 for i in 0..TOTAL as usize {
957 assert_eq!(expected[i], actual[i],);
958 }
959 }
960
961 #[test]
962 fn check_invalid_array_type() {
963 let array = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
964 let err = substring(&array, 0, None).unwrap_err().to_string();
965 assert!(err.contains("substring does not support type"));
966 }
967
968 #[test]
970 fn check_start_index() {
971 let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
972 let err = substring(&array, -1, None).unwrap_err().to_string();
973 assert!(err.contains("invalid utf-8 boundary"));
974 }
975
976 #[test]
977 fn check_length() {
978 let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
979 let err = substring(&array, 0, Some(5)).unwrap_err().to_string();
980 assert!(err.contains("invalid utf-8 boundary"));
981 }
982
983 #[test]
984 fn non_utf8_bytes() {
985 let bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD, 0xE8, 0xAF, 0xAD];
987 let array = BinaryArray::from(vec![Some(bytes)]);
988 let arr = substring(&array, 0, Some(5)).unwrap();
989 let actual = arr.as_any().downcast_ref::<BinaryArray>().unwrap();
990
991 let expected_bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5];
992 let expected = BinaryArray::from(vec![Some(expected_bytes)]);
993 assert_eq!(expected, *actual);
994 }
995}