1use crate::builder::ArrayBuilder;
19use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
20use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait};
21use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, NullBufferBuilder, ScalarBuffer};
22use arrow_data::ArrayDataBuilder;
23use arrow_schema::ArrowError;
24use std::any::Any;
25use std::sync::Arc;
26
27pub struct GenericByteBuilder<T: ByteArrayType> {
32 value_builder: Vec<u8>,
33 offsets_builder: Vec<T::Offset>,
34 null_buffer_builder: NullBufferBuilder,
35}
36
37impl<T: ByteArrayType> GenericByteBuilder<T> {
38 pub fn new() -> Self {
40 Self::with_capacity(1024, 1024)
41 }
42
43 pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
50 let mut offsets_builder = Vec::with_capacity(item_capacity + 1);
51 offsets_builder.push(T::Offset::from_usize(0).unwrap());
52 Self {
53 value_builder: Vec::with_capacity(data_capacity),
54 offsets_builder,
55 null_buffer_builder: NullBufferBuilder::new(item_capacity),
56 }
57 }
58
59 pub unsafe fn new_from_buffer(
66 offsets_buffer: MutableBuffer,
67 value_buffer: MutableBuffer,
68 null_buffer: Option<MutableBuffer>,
69 ) -> Self {
70 let offsets_builder: Vec<T::Offset> =
71 ScalarBuffer::<T::Offset>::from(offsets_buffer).into();
72 let value_builder: Vec<u8> = ScalarBuffer::<u8>::from(value_buffer).into();
73
74 let null_buffer_builder = null_buffer
75 .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1))
76 .unwrap_or_else(|| NullBufferBuilder::new_with_len(offsets_builder.len() - 1));
77
78 Self {
79 offsets_builder,
80 value_builder,
81 null_buffer_builder,
82 }
83 }
84
85 #[inline]
86 fn next_offset(&self) -> T::Offset {
87 T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow")
88 }
89
90 #[inline]
106 pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
107 self.value_builder
108 .extend_from_slice(value.as_ref().as_ref());
109 self.null_buffer_builder.append(true);
110 self.offsets_builder.push(self.next_offset());
111 }
112
113 #[inline]
120 pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
121 match value {
122 None => self.append_null(),
123 Some(v) => self.append_value(v),
124 };
125 }
126
127 #[inline]
129 pub fn append_null(&mut self) {
130 self.null_buffer_builder.append(false);
131 self.offsets_builder.push(self.next_offset());
132 }
133
134 #[inline]
136 pub fn append_nulls(&mut self, n: usize) {
137 self.null_buffer_builder.append_n_nulls(n);
138 let next_offset = self.next_offset();
139 self.offsets_builder
140 .extend(std::iter::repeat_n(next_offset, n));
141 }
142
143 #[inline]
146 pub fn append_array(&mut self, array: &GenericByteArray<T>) -> Result<(), ArrowError> {
147 use num_traits::CheckedAdd;
148 if array.len() == 0 {
149 return Ok(());
150 }
151
152 let offsets = array.offsets();
153
154 if self.next_offset() == offsets[0] {
157 self.offsets_builder.extend_from_slice(&offsets[1..]);
158 } else {
159 let shift: T::Offset = self.next_offset() - offsets[0];
161
162 if shift.checked_add(&offsets[offsets.len() - 1]).is_none() {
163 return Err(ArrowError::OffsetOverflowError(
164 shift.as_usize() + offsets[offsets.len() - 1].as_usize(),
165 ));
166 }
167
168 self.offsets_builder
169 .extend(offsets[1..].iter().map(|&offset| offset + shift));
170 }
171
172 self.value_builder.extend_from_slice(
174 &array.values().as_slice()[offsets[0].as_usize()..offsets[array.len()].as_usize()],
175 );
176
177 if let Some(null_buffer) = array.nulls() {
178 self.null_buffer_builder.append_buffer(null_buffer);
179 } else {
180 self.null_buffer_builder.append_n_non_nulls(array.len());
181 }
182 Ok(())
183 }
184
185 pub fn finish(&mut self) -> GenericByteArray<T> {
187 let array_type = T::DATA_TYPE;
188 let array_builder = ArrayDataBuilder::new(array_type)
189 .len(self.len())
190 .add_buffer(std::mem::take(&mut self.offsets_builder).into())
191 .add_buffer(std::mem::take(&mut self.value_builder).into())
192 .nulls(self.null_buffer_builder.finish());
193
194 self.offsets_builder.push(self.next_offset());
195 let array_data = unsafe { array_builder.build_unchecked() };
196 GenericByteArray::from(array_data)
197 }
198
199 pub fn finish_cloned(&self) -> GenericByteArray<T> {
201 let array_type = T::DATA_TYPE;
202 let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice());
203 let value_buffer = Buffer::from_slice_ref(self.value_builder.as_slice());
204 let array_builder = ArrayDataBuilder::new(array_type)
205 .len(self.len())
206 .add_buffer(offset_buffer)
207 .add_buffer(value_buffer)
208 .nulls(self.null_buffer_builder.finish_cloned());
209
210 let array_data = unsafe { array_builder.build_unchecked() };
211 GenericByteArray::from(array_data)
212 }
213
214 pub fn values_slice(&self) -> &[u8] {
216 self.value_builder.as_slice()
217 }
218
219 pub fn offsets_slice(&self) -> &[T::Offset] {
221 self.offsets_builder.as_slice()
222 }
223
224 pub fn validity_slice(&self) -> Option<&[u8]> {
226 self.null_buffer_builder.as_slice()
227 }
228
229 pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> {
231 self.null_buffer_builder.as_slice_mut()
232 }
233}
234
235impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> {
236 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
237 write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?;
238 f.debug_struct("")
239 .field("value_builder", &self.value_builder)
240 .field("offsets_builder", &self.offsets_builder)
241 .field("null_buffer_builder", &self.null_buffer_builder)
242 .finish()
243 }
244}
245
246impl<T: ByteArrayType> Default for GenericByteBuilder<T> {
247 fn default() -> Self {
248 Self::new()
249 }
250}
251
252impl<T: ByteArrayType> ArrayBuilder for GenericByteBuilder<T> {
253 fn len(&self) -> usize {
255 self.null_buffer_builder.len()
256 }
257
258 fn finish(&mut self) -> ArrayRef {
260 Arc::new(self.finish())
261 }
262
263 fn finish_cloned(&self) -> ArrayRef {
265 Arc::new(self.finish_cloned())
266 }
267
268 fn as_any(&self) -> &dyn Any {
270 self
271 }
272
273 fn as_any_mut(&mut self) -> &mut dyn Any {
275 self
276 }
277
278 fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
280 self
281 }
282}
283
284impl<T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>> for GenericByteBuilder<T> {
285 #[inline]
286 fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
287 for v in iter {
288 self.append_option(v)
289 }
290 }
291}
292
293pub type GenericStringBuilder<O> = GenericByteBuilder<GenericStringType<O>>;
343
344impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> {
345 fn write_str(&mut self, s: &str) -> std::fmt::Result {
346 self.value_builder.extend_from_slice(s.as_bytes());
347 Ok(())
348 }
349}
350
351pub type GenericBinaryBuilder<O> = GenericByteBuilder<GenericBinaryType<O>>;
397
398impl<O: OffsetSizeTrait> std::io::Write for GenericBinaryBuilder<O> {
399 fn write(&mut self, bs: &[u8]) -> std::io::Result<usize> {
400 self.value_builder.extend_from_slice(bs);
401 Ok(bs.len())
402 }
403
404 fn flush(&mut self) -> std::io::Result<()> {
405 Ok(())
406 }
407}
408
409#[cfg(test)]
410mod tests {
411 use super::*;
412 use crate::GenericStringArray;
413 use crate::array::Array;
414 use arrow_buffer::NullBuffer;
415 use std::fmt::Write as _;
416 use std::io::Write as _;
417
418 fn _test_generic_binary_builder<O: OffsetSizeTrait>() {
419 let mut builder = GenericBinaryBuilder::<O>::new();
420
421 builder.append_value(b"hello");
422 builder.append_value(b"");
423 builder.append_null();
424 builder.append_value(b"rust");
425
426 let array = builder.finish();
427
428 assert_eq!(4, array.len());
429 assert_eq!(1, array.null_count());
430 assert_eq!(b"hello", array.value(0));
431 assert_eq!([] as [u8; 0], array.value(1));
432 assert!(array.is_null(2));
433 assert_eq!(b"rust", array.value(3));
434 assert_eq!(O::from_usize(5).unwrap(), array.value_offsets()[2]);
435 assert_eq!(O::from_usize(4).unwrap(), array.value_length(3));
436 }
437
438 #[test]
439 fn test_binary_builder() {
440 _test_generic_binary_builder::<i32>()
441 }
442
443 #[test]
444 fn test_large_binary_builder() {
445 _test_generic_binary_builder::<i64>()
446 }
447
448 fn _test_generic_binary_builder_all_nulls<O: OffsetSizeTrait>() {
449 let mut builder = GenericBinaryBuilder::<O>::new();
450 builder.append_null();
451 builder.append_null();
452 builder.append_null();
453 builder.append_nulls(2);
454 assert_eq!(5, builder.len());
455 assert!(!builder.is_empty());
456
457 let array = builder.finish();
458 assert_eq!(5, array.null_count());
459 assert_eq!(5, array.len());
460 assert!(array.is_null(0));
461 assert!(array.is_null(1));
462 assert!(array.is_null(2));
463 assert!(array.is_null(3));
464 assert!(array.is_null(4));
465 }
466
467 #[test]
468 fn test_binary_builder_all_nulls() {
469 _test_generic_binary_builder_all_nulls::<i32>()
470 }
471
472 #[test]
473 fn test_large_binary_builder_all_nulls() {
474 _test_generic_binary_builder_all_nulls::<i64>()
475 }
476
477 fn _test_generic_binary_builder_reset<O: OffsetSizeTrait>() {
478 let mut builder = GenericBinaryBuilder::<O>::new();
479
480 builder.append_value(b"hello");
481 builder.append_value(b"");
482 builder.append_null();
483 builder.append_value(b"rust");
484 builder.finish();
485
486 assert!(builder.is_empty());
487
488 builder.append_value(b"parquet");
489 builder.append_null();
490 builder.append_value(b"arrow");
491 builder.append_value(b"");
492 builder.append_nulls(2);
493 builder.append_value(b"hi");
494 let array = builder.finish();
495
496 assert_eq!(7, array.len());
497 assert_eq!(3, array.null_count());
498 assert_eq!(b"parquet", array.value(0));
499 assert!(array.is_null(1));
500 assert!(array.is_null(4));
501 assert!(array.is_null(5));
502 assert_eq!(b"arrow", array.value(2));
503 assert_eq!(b"", array.value(1));
504 assert_eq!(b"hi", array.value(6));
505
506 assert_eq!(O::zero(), array.value_offsets()[0]);
507 assert_eq!(O::from_usize(7).unwrap(), array.value_offsets()[2]);
508 assert_eq!(O::from_usize(14).unwrap(), array.value_offsets()[7]);
509 assert_eq!(O::from_usize(5).unwrap(), array.value_length(2));
510 }
511
512 #[test]
513 fn test_binary_builder_reset() {
514 _test_generic_binary_builder_reset::<i32>()
515 }
516
517 #[test]
518 fn test_large_binary_builder_reset() {
519 _test_generic_binary_builder_reset::<i64>()
520 }
521
522 fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
523 let mut builder = GenericStringBuilder::<O>::new();
524 let owned = "arrow".to_owned();
525
526 builder.append_value("hello");
527 builder.append_value("");
528 builder.append_value(&owned);
529 builder.append_null();
530 builder.append_option(Some("rust"));
531 builder.append_option(None::<&str>);
532 builder.append_option(None::<String>);
533 builder.append_nulls(2);
534 builder.append_value("parquet");
535 assert_eq!(10, builder.len());
536
537 assert_eq!(
538 GenericStringArray::<O>::from(vec![
539 Some("hello"),
540 Some(""),
541 Some("arrow"),
542 None,
543 Some("rust"),
544 None,
545 None,
546 None,
547 None,
548 Some("parquet")
549 ]),
550 builder.finish()
551 );
552 }
553
554 #[test]
555 fn test_string_array_builder() {
556 _test_generic_string_array_builder::<i32>()
557 }
558
559 #[test]
560 fn test_large_string_array_builder() {
561 _test_generic_string_array_builder::<i64>()
562 }
563
564 fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
565 let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);
566
567 builder.append_value("hello");
568 builder.append_value("rust");
569 builder.append_null();
570
571 builder.finish();
572 assert!(builder.is_empty());
573 assert_eq!(&[O::zero()], builder.offsets_slice());
574
575 builder.append_value("arrow");
576 builder.append_value("parquet");
577 let arr = builder.finish();
578 assert!(arr.nulls().is_none());
580 assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
581 }
582
583 #[test]
584 fn test_string_array_builder_finish() {
585 _test_generic_string_array_builder_finish::<i32>()
586 }
587
588 #[test]
589 fn test_large_string_array_builder_finish() {
590 _test_generic_string_array_builder_finish::<i64>()
591 }
592
593 fn _test_generic_string_array_builder_finish_cloned<O: OffsetSizeTrait>() {
594 let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);
595
596 builder.append_value("hello");
597 builder.append_value("rust");
598 builder.append_null();
599
600 let mut arr = builder.finish_cloned();
601 assert!(!builder.is_empty());
602 assert_eq!(3, arr.len());
603
604 builder.append_value("arrow");
605 builder.append_value("parquet");
606 arr = builder.finish();
607
608 assert!(arr.nulls().is_some());
609 assert_eq!(&[O::zero()], builder.offsets_slice());
610 assert_eq!(5, arr.len());
611 }
612
613 #[test]
614 fn test_string_array_builder_finish_cloned() {
615 _test_generic_string_array_builder_finish_cloned::<i32>()
616 }
617
618 #[test]
619 fn test_large_string_array_builder_finish_cloned() {
620 _test_generic_string_array_builder_finish_cloned::<i64>()
621 }
622
/// Two consecutive `extend` calls must produce one contiguous set of offsets
/// and value bytes.
#[test]
fn test_extend() {
    let first_batch = ["a", "b", "c", "", "a", "b", "c"];
    let second_batch = ["d", "cupcakes", "hello"];

    let mut str_builder = GenericStringBuilder::<i32>::new();
    str_builder.extend(first_batch.into_iter().map(Some));
    str_builder.extend(second_batch.into_iter().map(Some));

    let built = str_builder.finish();
    assert_eq!(built.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]);
    assert_eq!(built.value_data(), b"abcabcdcupcakeshello");
}
632
/// Text written through `fmt::Write` accumulates until `append_value("")`
/// closes the item.
#[test]
fn test_write_str() {
    let mut str_builder = GenericStringBuilder::<i32>::new();

    // First item: a single write.
    write!(str_builder, "foo").unwrap();
    str_builder.append_value("");

    // Second item: `writeln!` adds the trailing newline.
    writeln!(str_builder, "bar").unwrap();
    str_builder.append_value("");

    // Third item: two writes joined together.
    write!(str_builder, "fiz").unwrap();
    write!(str_builder, "buz").unwrap();
    str_builder.append_value("");

    let built = str_builder.finish();
    let collected: Vec<_> = built.iter().flatten().collect();
    assert_eq!(collected, &["foo", "bar\n", "fizbuz"]);
}
647
/// Bytes written through `io::Write` accumulate until `append_value("")`
/// closes the item.
#[test]
fn test_write_bytes() {
    let mut bin_builder = GenericBinaryBuilder::<i32>::new();

    // First item: a single write.
    write!(bin_builder, "foo").unwrap();
    bin_builder.append_value("");

    // Second item: `writeln!` adds the trailing newline.
    writeln!(bin_builder, "bar").unwrap();
    bin_builder.append_value("");

    // Third item: two writes joined together.
    write!(bin_builder, "fiz").unwrap();
    write!(bin_builder, "buz").unwrap();
    bin_builder.append_value("");

    let built = bin_builder.finish();
    let collected: Vec<_> = built.iter().flatten().collect();
    assert_eq!(
        collected,
        &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()]
    );
}
665
/// Appending three null-free arrays must equal one array built from all the
/// words.
#[test]
fn test_append_array_without_nulls() {
    let words = vec![
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ];
    let head = GenericStringArray::<i32>::from(words[..3].to_vec());
    let middle = GenericStringArray::<i32>::from(words[3..7].to_vec());
    let tail = GenericStringArray::<i32>::from(words[7..].to_vec());

    let mut str_builder = GenericStringBuilder::<i32>::new();
    for part in [&head, &middle, &tail] {
        str_builder.append_array(part).unwrap();
    }

    let actual = str_builder.finish();
    let expected = GenericStringArray::<i32>::from(words);

    assert_eq!(actual, expected);
}
686
/// Appending three arrays that contain nulls must equal one array built from
/// all the items.
#[test]
fn test_append_array_with_nulls() {
    let items = vec![
        Some("hello"),
        None,
        Some("how"),
        None,
        None,
        None,
        None,
        Some("I"),
        Some("am"),
        Some("doing"),
        Some("well"),
    ];
    let head = GenericStringArray::<i32>::from(items[..3].to_vec());
    let middle = GenericStringArray::<i32>::from(items[3..7].to_vec());
    let tail = GenericStringArray::<i32>::from(items[7..].to_vec());

    let mut str_builder = GenericStringBuilder::<i32>::new();
    for part in [&head, &middle, &tail] {
        str_builder.append_array(part).unwrap();
    }

    let actual = str_builder.finish();
    let expected = GenericStringArray::<i32>::from(items);

    assert_eq!(actual, expected);
}
716
/// Appending an empty array succeeds and adds no items.
#[test]
fn test_append_empty_array() {
    let empty = GenericStringArray::<i32>::from(Vec::<&str>::new());

    let mut str_builder = GenericStringBuilder::<i32>::new();
    str_builder.append_array(&empty).unwrap();

    let built = str_builder.finish();
    assert_eq!(built.len(), 0);
}
725
/// Appending a slice whose first offset is non-zero must rebase its offsets
/// onto the (empty) builder.
#[test]
fn test_append_array_with_offset_not_starting_at_0() {
    let items = vec![
        Some("hello"),
        None,
        Some("how"),
        None,
        None,
        None,
        None,
        Some("I"),
        Some("am"),
        Some("doing"),
        Some("well"),
    ];
    let whole = GenericStringArray::<i32>::from(items);
    let window = whole.slice(1, 4);

    // Guard the premise: the slice neither starts at byte 0 nor reaches the
    // end of the underlying data.
    assert_ne!(window.offsets()[0].as_usize(), 0);
    assert_ne!(window.offsets().last(), whole.offsets().last());

    let mut str_builder = GenericStringBuilder::<i32>::new();
    str_builder.append_array(&window).unwrap();
    let actual = str_builder.finish();

    let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]);

    assert_eq!(actual, expected);
}
755
/// Arrays whose null slots still have bytes behind them must be appended with
/// those bytes intact in the value buffer.
#[test]
fn test_append_underlying_null_values_added_as_is() {
    // Each input is built from real words and then given a validity mask, so
    // the masked-out words remain in the value bytes.
    let first_with_nulls = {
        let words = vec![
            "hello", "world", "how", "are", "you", "doing", "today", "I", "am",
        ];
        let (offsets, buffer, _) = GenericStringArray::<i32>::from(words).into_parts();

        GenericStringArray::<i32>::new(
            offsets,
            buffer,
            Some(NullBuffer::from(&[
                true, false, true, false, false, true, true, true, false,
            ])),
        )
    };
    let second_with_nulls = {
        let words = vec!["doing", "well", "thank", "you", "for", "asking"];
        let (offsets, buffer, _) = GenericStringArray::<i32>::from(words).into_parts();

        GenericStringArray::<i32>::new(
            offsets,
            buffer,
            Some(NullBuffer::from(&[false, false, true, false, true, true])),
        )
    };

    let mut str_builder = GenericStringBuilder::<i32>::new();
    str_builder.append_array(&first_with_nulls).unwrap();
    str_builder.append_array(&second_with_nulls).unwrap();

    let actual = str_builder.finish();
    let expected = GenericStringArray::<i32>::from(vec![
        Some("hello"),
        None, // world
        Some("how"),
        None, // are
        None, // you
        Some("doing"),
        Some("today"),
        Some("I"),
        None, // am
        None, // doing
        None, // well
        Some("thank"),
        None, // you
        Some("for"),
        Some("asking"),
    ]);

    assert_eq!(actual, expected);

    // The value bytes must still contain every word, masked or not.
    let all_words = [
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ];
    let expected_underlying_buffer = Buffer::from(all_words.join("").as_bytes());
    assert_eq!(actual.values(), &expected_underlying_buffer);
}
818
/// Appending consecutive slices of one array, in order, must reproduce that
/// array.
#[test]
fn append_array_with_continues_indices() {
    let words = vec![
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ];
    let whole = GenericStringArray::<i32>::from(words);
    let head = whole.slice(0, 3);
    let middle = whole.slice(3, 4);
    let tail = whole.slice(7, whole.len() - 7);

    let mut str_builder = GenericStringBuilder::<i32>::new();
    for part in [&head, &middle, &tail] {
        str_builder.append_array(part).unwrap();
    }

    let actual = str_builder.finish();

    assert_eq!(actual, whole);
}
839
/// Appending an array that would push the final `i32` offset past
/// `i32::MAX` must return `OffsetOverflowError`.
#[test]
fn test_append_array_offset_overflow_precise() {
    let mut str_builder = GenericStringBuilder::<i32>::new();

    // Fill the builder to 100 bytes short of the largest i32 offset.
    let filler = "x".repeat(i32::MAX as usize - 100);
    str_builder.append_value(&filler);

    // 200 more bytes cannot fit in the remaining 100.
    let excess = "y".repeat(200);
    let excess_array = GenericStringArray::<i32>::from(vec![excess.as_str()]);

    let outcome = str_builder.append_array(&excess_array);

    assert!(matches!(outcome, Err(ArrowError::OffsetOverflowError(_))));
}
854}