1use arrow_array::builder::BufferBuilder;
23use arrow_array::types::*;
24use arrow_array::*;
25use arrow_buffer::{ArrowNativeType, MutableBuffer, NullBuffer, OffsetBuffer};
26use arrow_schema::{ArrowError, DataType};
27use num_traits::Zero;
28use std::cmp::Ordering;
29use std::sync::Arc;
30
31pub fn substring(
72 array: &dyn Array,
73 start: i64,
74 length: Option<u64>,
75) -> Result<ArrayRef, ArrowError> {
76 macro_rules! substring_dict {
77 ($kt: ident, $($t: ident: $gt: ident), *) => {
78 match $kt.as_ref() {
79 $(
80 &DataType::$t => {
81 let dict = array
82 .as_any()
83 .downcast_ref::<DictionaryArray<$gt>>()
84 .unwrap_or_else(|| {
85 panic!("Expect 'DictionaryArray<{}>' but got array of data type {:?}",
86 stringify!($gt), array.data_type())
87 });
88 let values = substring(dict.values(), start, length)?;
89 Ok(Arc::new(dict.with_values(values)))
90 },
91 )*
92 t => panic!("Unsupported dictionary key type: {}", t)
93 }
94 }
95 }
96
97 match array.data_type() {
98 DataType::Dictionary(kt, _) => {
99 substring_dict!(
100 kt,
101 Int8: Int8Type,
102 Int16: Int16Type,
103 Int32: Int32Type,
104 Int64: Int64Type,
105 UInt8: UInt8Type,
106 UInt16: UInt16Type,
107 UInt32: UInt32Type,
108 UInt64: UInt64Type
109 )
110 }
111 DataType::LargeBinary => byte_substring(
112 array
113 .as_any()
114 .downcast_ref::<LargeBinaryArray>()
115 .expect("A large binary is expected"),
116 start,
117 length.map(|e| e as i64),
118 ),
119 DataType::Binary => byte_substring(
120 array
121 .as_any()
122 .downcast_ref::<BinaryArray>()
123 .expect("A binary is expected"),
124 start as i32,
125 length.map(|e| e as i32),
126 ),
127 DataType::FixedSizeBinary(old_len) => fixed_size_binary_substring(
128 array
129 .as_any()
130 .downcast_ref::<FixedSizeBinaryArray>()
131 .expect("a fixed size binary is expected"),
132 *old_len,
133 start as i32,
134 length.map(|e| e as i32),
135 ),
136 DataType::LargeUtf8 => byte_substring(
137 array
138 .as_any()
139 .downcast_ref::<LargeStringArray>()
140 .expect("A large string is expected"),
141 start,
142 length.map(|e| e as i64),
143 ),
144 DataType::Utf8 => byte_substring(
145 array
146 .as_any()
147 .downcast_ref::<StringArray>()
148 .expect("A string is expected"),
149 start as i32,
150 length.map(|e| e as i32),
151 ),
152 _ => Err(ArrowError::ComputeError(format!(
153 "substring does not support type {:?}",
154 array.data_type()
155 ))),
156 }
157}
158
159pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
188 array: &GenericStringArray<OffsetSize>,
189 start: i64,
190 length: Option<u64>,
191) -> Result<GenericStringArray<OffsetSize>, ArrowError> {
192 let mut vals = BufferBuilder::<u8>::new({
193 let offsets = array.value_offsets();
194 (offsets[array.len()] - offsets[0]).to_usize().unwrap()
195 });
196 let mut new_offsets = BufferBuilder::<OffsetSize>::new(array.len() + 1);
197 new_offsets.append(OffsetSize::zero());
198 let length = length.map(|len| len.to_usize().unwrap());
199
200 array.iter().for_each(|val| {
201 if let Some(val) = val {
202 let char_count = val.chars().count();
203 let start = if start >= 0 {
204 start.to_usize().unwrap()
205 } else {
206 char_count - (-start).to_usize().unwrap().min(char_count)
207 };
208 let (start_offset, end_offset) = get_start_end_offset(val, start, length);
209 vals.append_slice(&val.as_bytes()[start_offset..end_offset]);
210 }
211 new_offsets.append(OffsetSize::from_usize(vals.len()).unwrap());
212 });
213 let offsets = OffsetBuffer::new(new_offsets.finish().into());
214 let values = vals.finish();
215 let nulls = array
216 .nulls()
217 .map(|n| n.inner().sliced())
218 .and_then(|b| NullBuffer::from_unsliced_buffer(b, array.len()));
219 Ok(GenericStringArray::<OffsetSize>::new(
220 offsets, values, nulls,
221 ))
222}
223
224fn get_start_end_offset(val: &str, start: usize, length: Option<usize>) -> (usize, usize) {
230 let len = val.len();
231 let mut offset_char_iter = val.char_indices();
232 let start_offset = offset_char_iter
233 .nth(start)
234 .map_or(len, |(offset, _)| offset);
235 let end_offset = length.map_or(len, |length| {
236 if length > 0 {
237 offset_char_iter
238 .nth(length - 1)
239 .map_or(len, |(offset, _)| offset)
240 } else {
241 start_offset
242 }
243 });
244 (start_offset, end_offset)
245}
246
247fn byte_substring<T: ByteArrayType>(
248 array: &GenericByteArray<T>,
249 start: T::Offset,
250 length: Option<T::Offset>,
251) -> Result<ArrayRef, ArrowError>
252where
253 <T as ByteArrayType>::Native: PartialEq,
254{
255 let offsets = array.value_offsets();
256 let data = array.value_data();
257 let zero = <T::Offset as Zero>::zero();
258
259 let check_char_boundary = {
261 |offset: T::Offset| {
262 if !matches!(T::DATA_TYPE, DataType::Utf8 | DataType::LargeUtf8) {
263 return Ok(offset);
264 }
265 let data_str = unsafe { std::str::from_utf8_unchecked(data) };
267 let offset_usize = offset.as_usize();
268 if data_str.is_char_boundary(offset_usize) {
269 Ok(offset)
270 } else {
271 Err(ArrowError::ComputeError(format!(
272 "The offset {offset_usize} is at an invalid utf-8 boundary."
273 )))
274 }
275 }
276 };
277
278 let mut new_starts_ends: Vec<(T::Offset, T::Offset)> = Vec::with_capacity(array.len());
280 let mut new_offsets: Vec<T::Offset> = Vec::with_capacity(array.len() + 1);
281 let mut len_so_far = zero;
282 new_offsets.push(zero);
283
284 offsets
285 .windows(2)
286 .try_for_each(|pair| -> Result<(), ArrowError> {
287 let new_start = match start.cmp(&zero) {
288 Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?,
289 Ordering::Equal => pair[0],
290 Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?,
291 };
292 let new_end = match length {
293 Some(length) => check_char_boundary((length + new_start).min(pair[1]))?,
294 None => pair[1],
295 };
296 len_so_far += new_end - new_start;
297 new_starts_ends.push((new_start, new_end));
298 new_offsets.push(len_so_far);
299 Ok(())
300 })?;
301
302 let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize());
304
305 new_starts_ends
306 .iter()
307 .map(|(start, end)| {
308 let start = start.as_usize();
309 let end = end.as_usize();
310 &data[start..end]
311 })
312 .for_each(|slice| new_values.extend_from_slice(slice));
313
314 let offsets = OffsetBuffer::new(new_offsets.into());
315 let values = new_values.into();
316 let nulls = array
317 .nulls()
318 .map(|n| n.inner().sliced())
319 .and_then(|b| NullBuffer::from_unsliced_buffer(b, array.len()));
320 Ok(Arc::new(GenericByteArray::<T>::new(offsets, values, nulls)))
321}
322
323fn fixed_size_binary_substring(
324 array: &FixedSizeBinaryArray,
325 old_len: i32,
326 start: i32,
327 length: Option<i32>,
328) -> Result<ArrayRef, ArrowError> {
329 let new_start = if start >= 0 {
330 start.min(old_len)
331 } else {
332 (old_len + start).max(0)
333 };
334 let new_len = match length {
335 Some(len) => len.min(old_len - new_start),
336 None => old_len - new_start,
337 };
338
339 let num_of_elements = array.len();
341 let data = array.value_data();
342 let mut new_values = MutableBuffer::new(num_of_elements * (new_len as usize));
343 (0..num_of_elements)
344 .map(|idx| {
345 let offset = array.value_offset(idx);
346 (
347 (offset + new_start) as usize,
348 (offset + new_start + new_len) as usize,
349 )
350 })
351 .for_each(|(start, end)| new_values.extend_from_slice(&data[start..end]));
352
353 let mut nulls = array
354 .nulls()
355 .map(|n| n.inner().sliced())
356 .and_then(|b| NullBuffer::from_unsliced_buffer(b, num_of_elements));
357
358 if new_len == 0 && nulls.is_none() {
359 nulls = Some(NullBuffer::new_valid(num_of_elements));
364 }
365
366 Ok(Arc::new(FixedSizeBinaryArray::new(
367 new_len,
368 new_values.into(),
369 nulls,
370 )))
371}
372
373#[cfg(test)]
374mod tests {
375 use super::*;
376 use arrow_buffer::BooleanBuffer;
377 use arrow_buffer::Buffer;
378
379 macro_rules! gen_test_cases {
390 ($input:expr, $(($start:expr, $len:expr, $result:expr)), *) => {
391 [
392 $(
393 ($input.clone(), $start, $len, $result),
394 )*
395 ]
396 };
397 }
398
399 macro_rules! do_test {
406 ($cases:expr, $array_ty:ty, $substring_fn:ident) => {
407 $cases
408 .into_iter()
409 .for_each(|(array, start, length, expected)| {
410 let array = <$array_ty>::from(array);
411 let result = $substring_fn(&array, start, length).unwrap();
412 let result = result.as_any().downcast_ref::<$array_ty>().unwrap();
413 let expected = <$array_ty>::from(expected);
414 assert_eq!(&expected, result);
415 })
416 };
417 }
418
419 fn with_nulls_generic_binary<O: OffsetSizeTrait>() {
420 let input = vec![
421 Some("hello".as_bytes()),
422 None,
423 Some(&[0xf8, 0xf9, 0xff, 0xfa]),
424 ];
425 let base_case = gen_test_cases!(
427 vec![None, None, None],
428 (-1, Some(1), vec![None, None, None])
429 );
430 let cases = gen_test_cases!(
431 input,
432 (0, None, input.clone()),
434 (0, Some(0), vec![Some(&[]), None, Some(&[])]),
436 (1000, Some(0), vec![Some(&[]), None, Some(&[])]),
438 (-1000, None, input.clone()),
440 (0, Some(1000), input.clone())
442 );
443
444 do_test!(
445 [&base_case[..], &cases[..]].concat(),
446 GenericBinaryArray<O>,
447 substring
448 );
449 }
450
451 #[test]
452 fn with_nulls_binary() {
453 with_nulls_generic_binary::<i32>()
454 }
455
456 #[test]
457 fn with_nulls_large_binary() {
458 with_nulls_generic_binary::<i64>()
459 }
460
461 fn without_nulls_generic_binary<O: OffsetSizeTrait>() {
462 let input = vec!["hello".as_bytes(), b"", &[0xf8, 0xf9, 0xff, 0xfa]];
463 let base_case = gen_test_cases!(
465 vec!["".as_bytes(), b"", b""],
466 (2, Some(1), vec!["".as_bytes(), b"", b""])
467 );
468 let cases = gen_test_cases!(
469 input,
470 (0, None, input.clone()),
472 (1, None, vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]),
474 (2, None, vec![b"llo", b"", &[0xff, 0xfa]]),
475 (3, None, vec![b"lo", b"", &[0xfa]]),
476 (10, None, vec![b"", b"", b""]),
477 (-1, None, vec![b"o", b"", &[0xfa]]),
479 (-2, None, vec![b"lo", b"", &[0xff, 0xfa]]),
480 (-3, None, vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]),
481 (-10, None, input.clone()),
482 (1, Some(1), vec![b"e", b"", &[0xf9]]),
484 (1, Some(2), vec![b"el", b"", &[0xf9, 0xff]]),
485 (1, Some(3), vec![b"ell", b"", &[0xf9, 0xff, 0xfa]]),
486 (1, Some(4), vec![b"ello", b"", &[0xf9, 0xff, 0xfa]]),
487 (-3, Some(1), vec![b"l", b"", &[0xf9]]),
488 (-3, Some(2), vec![b"ll", b"", &[0xf9, 0xff]]),
489 (-3, Some(3), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]]),
490 (-3, Some(4), vec![b"llo", b"", &[0xf9, 0xff, 0xfa]])
491 );
492
493 do_test!(
494 [&base_case[..], &cases[..]].concat(),
495 GenericBinaryArray<O>,
496 substring
497 );
498 }
499
500 #[test]
501 fn without_nulls_binary() {
502 without_nulls_generic_binary::<i32>()
503 }
504
505 #[test]
506 fn without_nulls_large_binary() {
507 without_nulls_generic_binary::<i64>()
508 }
509
510 fn generic_binary_with_non_zero_offset<O: OffsetSizeTrait>() {
511 let values = 0_u8..15;
512 let offsets = &[
513 O::zero(),
514 O::from_usize(5).unwrap(),
515 O::from_usize(10).unwrap(),
516 O::from_usize(15).unwrap(),
517 ];
518 let bitmap = [0b101_u8];
520
521 let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
522 let values = Buffer::from_iter(values);
523 let nulls = Some(NullBuffer::new(BooleanBuffer::new(
524 Buffer::from(bitmap),
525 0,
526 3,
527 )));
528 let array = GenericBinaryArray::<O>::new(offsets, values, nulls).slice(1, 2);
530 let result = substring(&array, 1, None).unwrap();
532 let result = result
533 .as_any()
534 .downcast_ref::<GenericBinaryArray<O>>()
535 .unwrap();
536 let expected =
537 GenericBinaryArray::<O>::from_opt_vec(vec![None, Some(&[11_u8, 12, 13, 14])]);
538 assert_eq!(result, &expected);
539 }
540
541 #[test]
542 fn binary_with_non_zero_offset() {
543 generic_binary_with_non_zero_offset::<i32>()
544 }
545
546 #[test]
547 fn large_binary_with_non_zero_offset() {
548 generic_binary_with_non_zero_offset::<i64>()
549 }
550
551 #[test]
552 fn with_nulls_fixed_size_binary() {
553 let input = vec![Some("cat".as_bytes()), None, Some(&[0xf8, 0xf9, 0xff])];
554 let base_case =
556 gen_test_cases!(vec![None, None, None], (3, Some(2), vec![None, None, None]));
557 let cases = gen_test_cases!(
558 input,
559 (0, None, input.clone()),
561 (1, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
563 (2, None, vec![Some(b"t"), None, Some(&[0xff])]),
564 (3, None, vec![Some(b""), None, Some(b"")]),
565 (10, None, vec![Some(b""), None, Some(b"")]),
566 (-1, None, vec![Some(b"t"), None, Some(&[0xff])]),
568 (-2, None, vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
569 (-3, None, input.clone()),
570 (-10, None, input.clone()),
571 (1, Some(1), vec![Some(b"a"), None, Some(&[0xf9])]),
573 (1, Some(2), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
574 (1, Some(3), vec![Some(b"at"), None, Some(&[0xf9, 0xff])]),
575 (-3, Some(1), vec![Some(b"c"), None, Some(&[0xf8])]),
576 (-3, Some(2), vec![Some(b"ca"), None, Some(&[0xf8, 0xf9])]),
577 (-3, Some(3), input.clone()),
578 (-3, Some(4), input.clone())
579 );
580
581 do_test!(
582 [&base_case[..], &cases[..]].concat(),
583 FixedSizeBinaryArray,
584 substring
585 );
586 }
587
588 #[test]
589 fn without_nulls_fixed_size_binary() {
590 let input = vec!["cat".as_bytes(), b"dog", &[0xf8, 0xf9, 0xff]];
591 let base_case = gen_test_cases!(
593 vec!["".as_bytes(), &[], &[]],
594 (1, Some(2), vec!["".as_bytes(), &[], &[]])
595 );
596 let cases = gen_test_cases!(
597 input,
598 (0, None, input.clone()),
600 (1, None, vec![b"at", b"og", &[0xf9, 0xff]]),
602 (2, None, vec![b"t", b"g", &[0xff]]),
603 (3, None, vec![&[], &[], &[]]),
604 (10, None, vec![&[], &[], &[]]),
605 (-1, None, vec![b"t", b"g", &[0xff]]),
607 (-2, None, vec![b"at", b"og", &[0xf9, 0xff]]),
608 (-3, None, input.clone()),
609 (-10, None, input.clone()),
610 (1, Some(1), vec![b"a", b"o", &[0xf9]]),
612 (1, Some(2), vec![b"at", b"og", &[0xf9, 0xff]]),
613 (1, Some(3), vec![b"at", b"og", &[0xf9, 0xff]]),
614 (-3, Some(1), vec![b"c", b"d", &[0xf8]]),
615 (-3, Some(2), vec![b"ca", b"do", &[0xf8, 0xf9]]),
616 (-3, Some(3), input.clone()),
617 (-3, Some(4), input.clone())
618 );
619
620 do_test!(
621 [&base_case[..], &cases[..]].concat(),
622 FixedSizeBinaryArray,
623 substring
624 );
625 }
626
627 #[test]
628 fn fixed_size_binary_with_non_zero_offset() {
629 let values: [u8; 15] = *b"hellotherearrow";
630 let bits_v = [0b101_u8];
632
633 let nulls = Some(NullBuffer::new(BooleanBuffer::new(
634 Buffer::from(bits_v),
635 0,
636 3,
637 )));
638 let array = FixedSizeBinaryArray::new(5, Buffer::from(&values), nulls).slice(1, 2);
640 let result = substring(&array, 1, None).unwrap();
642 let result = result
643 .as_any()
644 .downcast_ref::<FixedSizeBinaryArray>()
645 .unwrap();
646 let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
647 vec![None, Some(b"rrow")].into_iter(),
648 4,
649 )
650 .unwrap();
651 assert_eq!(result, &expected);
652 }
653
654 fn with_nulls_generic_string<O: OffsetSizeTrait>() {
655 let input = vec![Some("hello"), None, Some("word")];
656 let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None]));
658 let cases = gen_test_cases!(
659 input,
660 (0, None, input.clone()),
662 (0, Some(0), vec![Some(""), None, Some("")]),
664 (1000, Some(0), vec![Some(""), None, Some("")]),
666 (-1000, None, input.clone()),
668 (0, Some(1000), input.clone())
670 );
671
672 do_test!(
673 [&base_case[..], &cases[..]].concat(),
674 GenericStringArray<O>,
675 substring
676 );
677 }
678
679 #[test]
680 fn with_nulls_string() {
681 with_nulls_generic_string::<i32>()
682 }
683
684 #[test]
685 fn with_nulls_large_string() {
686 with_nulls_generic_string::<i64>()
687 }
688
689 fn without_nulls_generic_string<O: OffsetSizeTrait>() {
690 let input = vec!["hello", "", "word"];
691 let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""]));
693 let cases = gen_test_cases!(
694 input,
695 (0, None, input.clone()),
697 (1, None, vec!["ello", "", "ord"]),
698 (2, None, vec!["llo", "", "rd"]),
699 (3, None, vec!["lo", "", "d"]),
700 (10, None, vec!["", "", ""]),
701 (-1, None, vec!["o", "", "d"]),
703 (-2, None, vec!["lo", "", "rd"]),
704 (-3, None, vec!["llo", "", "ord"]),
705 (-10, None, input.clone()),
706 (1, Some(1), vec!["e", "", "o"]),
708 (1, Some(2), vec!["el", "", "or"]),
709 (1, Some(3), vec!["ell", "", "ord"]),
710 (1, Some(4), vec!["ello", "", "ord"]),
711 (-3, Some(1), vec!["l", "", "o"]),
712 (-3, Some(2), vec!["ll", "", "or"]),
713 (-3, Some(3), vec!["llo", "", "ord"]),
714 (-3, Some(4), vec!["llo", "", "ord"])
715 );
716
717 do_test!(
718 [&base_case[..], &cases[..]].concat(),
719 GenericStringArray<O>,
720 substring
721 );
722 }
723
724 #[test]
725 fn without_nulls_string() {
726 without_nulls_generic_string::<i32>()
727 }
728
729 #[test]
730 fn without_nulls_large_string() {
731 without_nulls_generic_string::<i64>()
732 }
733
734 fn generic_string_with_non_zero_offset<O: OffsetSizeTrait>() {
735 let values = b"hellotherearrow";
736 let offsets = &[
737 O::zero(),
738 O::from_usize(5).unwrap(),
739 O::from_usize(10).unwrap(),
740 O::from_usize(15).unwrap(),
741 ];
742 let bitmap = [0b101_u8];
744
745 let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
746 let values = Buffer::from(values);
747 let nulls = Some(NullBuffer::new(BooleanBuffer::new(
748 Buffer::from(bitmap),
749 0,
750 3,
751 )));
752 let array = GenericStringArray::<O>::new(offsets, values, nulls).slice(1, 2);
754 let result = substring(&array, 1, None).unwrap();
756 let result = result
757 .as_any()
758 .downcast_ref::<GenericStringArray<O>>()
759 .unwrap();
760 let expected = GenericStringArray::<O>::from(vec![None, Some("rrow")]);
761 assert_eq!(result, &expected);
762 }
763
764 #[test]
765 fn string_with_non_zero_offset() {
766 generic_string_with_non_zero_offset::<i32>()
767 }
768
769 #[test]
770 fn large_string_with_non_zero_offset() {
771 generic_string_with_non_zero_offset::<i64>()
772 }
773
774 fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() {
775 let input = vec![Some("hello"), None, Some("Γ ⊢x:T")];
776 let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None]));
778 let cases = gen_test_cases!(
779 input,
780 (0, None, input.clone()),
782 (0, Some(0), vec![Some(""), None, Some("")]),
784 (1000, Some(0), vec![Some(""), None, Some("")]),
786 (-1000, None, input.clone()),
788 (0, Some(1000), input.clone())
790 );
791
792 do_test!(
793 [&base_case[..], &cases[..]].concat(),
794 GenericStringArray<O>,
795 substring_by_char
796 );
797 }
798
799 #[test]
800 fn with_nulls_string_by_char() {
801 with_nulls_generic_string_by_char::<i32>()
802 }
803
804 #[test]
805 fn with_nulls_large_string_by_char() {
806 with_nulls_generic_string_by_char::<i64>()
807 }
808
809 fn without_nulls_generic_string_by_char<O: OffsetSizeTrait>() {
810 let input = vec!["hello", "", "Γ ⊢x:T"];
811 let base_case = gen_test_cases!(vec!["", "", ""], (0, None, vec!["", "", ""]));
813 let cases = gen_test_cases!(
814 input,
815 (0, None, input.clone()),
817 (1, None, vec!["ello", "", " ⊢x:T"]),
819 (2, None, vec!["llo", "", "⊢x:T"]),
820 (3, None, vec!["lo", "", "x:T"]),
821 (10, None, vec!["", "", ""]),
822 (-1, None, vec!["o", "", "T"]),
824 (-2, None, vec!["lo", "", ":T"]),
825 (-4, None, vec!["ello", "", "⊢x:T"]),
826 (-10, None, input.clone()),
827 (1, Some(1), vec!["e", "", " "]),
829 (1, Some(2), vec!["el", "", " ⊢"]),
830 (1, Some(3), vec!["ell", "", " ⊢x"]),
831 (1, Some(6), vec!["ello", "", " ⊢x:T"]),
832 (-4, Some(1), vec!["e", "", "⊢"]),
833 (-4, Some(2), vec!["el", "", "⊢x"]),
834 (-4, Some(3), vec!["ell", "", "⊢x:"]),
835 (-4, Some(4), vec!["ello", "", "⊢x:T"])
836 );
837
838 do_test!(
839 [&base_case[..], &cases[..]].concat(),
840 GenericStringArray<O>,
841 substring_by_char
842 );
843 }
844
845 #[test]
846 fn without_nulls_string_by_char() {
847 without_nulls_generic_string_by_char::<i32>()
848 }
849
850 #[test]
851 fn without_nulls_large_string_by_char() {
852 without_nulls_generic_string_by_char::<i64>()
853 }
854
855 fn generic_string_by_char_with_non_zero_offset<O: OffsetSizeTrait>() {
856 let values = "S→T = Πx:S.T";
857 let offsets = &[
858 O::zero(),
859 O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap()).unwrap(),
860 O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap()).unwrap(),
861 O::from_usize(values.len()).unwrap(),
862 ];
863 let bitmap = [0b101_u8];
865
866 let offsets = OffsetBuffer::new(Buffer::from_slice_ref(offsets).into());
867 let values = Buffer::from(values.as_bytes());
868 let nulls = Some(NullBuffer::new(BooleanBuffer::new(
869 Buffer::from(bitmap),
870 0,
871 3,
872 )));
873 let array = GenericStringArray::<O>::new(offsets, values, nulls).slice(1, 2);
875 let result = substring_by_char(&array, 1, None).unwrap();
877 let expected = GenericStringArray::<O>::from(vec![None, Some("x:S.T")]);
878 assert_eq!(result, expected);
879 }
880
881 #[test]
882 fn string_with_non_zero_offset_by_char() {
883 generic_string_by_char_with_non_zero_offset::<i32>()
884 }
885
886 #[test]
887 fn large_string_with_non_zero_offset_by_char() {
888 generic_string_by_char_with_non_zero_offset::<i64>()
889 }
890
891 #[test]
892 fn dictionary() {
893 _dictionary::<Int8Type>();
894 _dictionary::<Int16Type>();
895 _dictionary::<Int32Type>();
896 _dictionary::<Int64Type>();
897 _dictionary::<UInt8Type>();
898 _dictionary::<UInt16Type>();
899 _dictionary::<UInt32Type>();
900 _dictionary::<UInt64Type>();
901 }
902
903 fn _dictionary<K: ArrowDictionaryKeyType>() {
904 const TOTAL: i32 = 100;
905
906 let v = ["aaa", "bbb", "ccc", "ddd", "eee"];
907 let data: Vec<Option<&str>> = (0..TOTAL)
908 .map(|n| {
909 let i = n % 5;
910 if i == 3 { None } else { Some(v[i as usize]) }
911 })
912 .collect();
913
914 let dict_array: DictionaryArray<K> = data.clone().into_iter().collect();
915
916 let expected: Vec<Option<&str>> = data.iter().map(|opt| opt.map(|s| &s[1..3])).collect();
917
918 let res = substring(&dict_array, 1, Some(2)).unwrap();
919 let actual = res.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
920 let actual: Vec<Option<&str>> = actual
921 .values()
922 .as_any()
923 .downcast_ref::<GenericStringArray<i32>>()
924 .unwrap()
925 .take_iter(actual.keys_iter())
926 .collect();
927
928 for i in 0..TOTAL as usize {
929 assert_eq!(expected[i], actual[i],);
930 }
931 }
932
933 #[test]
934 fn check_invalid_array_type() {
935 let array = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
936 let err = substring(&array, 0, None).unwrap_err().to_string();
937 assert!(err.contains("substring does not support type"));
938 }
939
940 #[test]
942 fn check_start_index() {
943 let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
944 let err = substring(&array, -1, None).unwrap_err().to_string();
945 assert!(err.contains("invalid utf-8 boundary"));
946 }
947
948 #[test]
949 fn check_length() {
950 let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
951 let err = substring(&array, 0, Some(5)).unwrap_err().to_string();
952 assert!(err.contains("invalid utf-8 boundary"));
953 }
954
955 #[test]
956 fn non_utf8_bytes() {
957 let bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD, 0xE8, 0xAF, 0xAD];
959 let array = BinaryArray::from(vec![Some(bytes)]);
960 let arr = substring(&array, 0, Some(5)).unwrap();
961 let actual = arr.as_any().downcast_ref::<BinaryArray>().unwrap();
962
963 let expected_bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5];
964 let expected = BinaryArray::from(vec![Some(expected_bytes)]);
965 assert_eq!(expected, *actual);
966 }
967}