1use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder};
19use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
20use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait};
21use arrow_buffer::NullBufferBuilder;
22use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
23use arrow_data::ArrayDataBuilder;
24use std::any::Any;
25use std::sync::Arc;
26
/// Incremental builder for a [`GenericByteArray`] of byte array type `T`.
///
/// Item bytes are stored back to back in `value_builder`; `offsets_builder`
/// records where each item ends, and `null_buffer_builder` records whether
/// each item is valid or null.
pub struct GenericByteBuilder<T: ByteArrayType> {
    /// Concatenated bytes of every appended item.
    value_builder: UInt8BufferBuilder,
    /// One leading offset followed by one end offset per appended item.
    offsets_builder: BufferBuilder<T::Offset>,
    /// Validity flag for every appended item.
    null_buffer_builder: NullBufferBuilder,
}
36
37impl<T: ByteArrayType> GenericByteBuilder<T> {
38 pub fn new() -> Self {
40 Self::with_capacity(1024, 1024)
41 }
42
43 pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
50 let mut offsets_builder = BufferBuilder::<T::Offset>::new(item_capacity + 1);
51 offsets_builder.append(T::Offset::from_usize(0).unwrap());
52 Self {
53 value_builder: UInt8BufferBuilder::new(data_capacity),
54 offsets_builder,
55 null_buffer_builder: NullBufferBuilder::new(item_capacity),
56 }
57 }
58
59 pub unsafe fn new_from_buffer(
66 offsets_buffer: MutableBuffer,
67 value_buffer: MutableBuffer,
68 null_buffer: Option<MutableBuffer>,
69 ) -> Self {
70 let offsets_builder = BufferBuilder::<T::Offset>::new_from_buffer(offsets_buffer);
71 let value_builder = BufferBuilder::<u8>::new_from_buffer(value_buffer);
72
73 let null_buffer_builder = null_buffer
74 .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1))
75 .unwrap_or_else(|| NullBufferBuilder::new_with_len(offsets_builder.len() - 1));
76
77 Self {
78 offsets_builder,
79 value_builder,
80 null_buffer_builder,
81 }
82 }
83
84 #[inline]
85 fn next_offset(&self) -> T::Offset {
86 T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow")
87 }
88
89 #[inline]
105 pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
106 self.value_builder.append_slice(value.as_ref().as_ref());
107 self.null_buffer_builder.append(true);
108 self.offsets_builder.append(self.next_offset());
109 }
110
111 #[inline]
118 pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
119 match value {
120 None => self.append_null(),
121 Some(v) => self.append_value(v),
122 };
123 }
124
125 #[inline]
127 pub fn append_null(&mut self) {
128 self.null_buffer_builder.append(false);
129 self.offsets_builder.append(self.next_offset());
130 }
131
132 #[inline]
135 pub fn append_array(&mut self, array: &GenericByteArray<T>) {
136 if array.len() == 0 {
137 return;
138 }
139
140 let offsets = array.offsets();
141
142 if self.next_offset() == offsets[0] {
145 self.offsets_builder.append_slice(&offsets[1..]);
146 } else {
147 let shift: T::Offset = self.next_offset() - offsets[0];
149
150 let mut intermediate = Vec::with_capacity(offsets.len() - 1);
154
155 for &offset in &offsets[1..] {
156 intermediate.push(offset + shift)
157 }
158
159 self.offsets_builder.append_slice(&intermediate);
160 }
161
162 self.value_builder.append_slice(
164 &array.values().as_slice()[offsets[0].as_usize()..offsets[array.len()].as_usize()],
165 );
166
167 if let Some(null_buffer) = array.nulls() {
168 self.null_buffer_builder.append_buffer(null_buffer);
169 } else {
170 self.null_buffer_builder.append_n_non_nulls(array.len());
171 }
172 }
173
174 pub fn finish(&mut self) -> GenericByteArray<T> {
176 let array_type = T::DATA_TYPE;
177 let array_builder = ArrayDataBuilder::new(array_type)
178 .len(self.len())
179 .add_buffer(self.offsets_builder.finish())
180 .add_buffer(self.value_builder.finish())
181 .nulls(self.null_buffer_builder.finish());
182
183 self.offsets_builder.append(self.next_offset());
184 let array_data = unsafe { array_builder.build_unchecked() };
185 GenericByteArray::from(array_data)
186 }
187
188 pub fn finish_cloned(&self) -> GenericByteArray<T> {
190 let array_type = T::DATA_TYPE;
191 let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice());
192 let value_buffer = Buffer::from_slice_ref(self.value_builder.as_slice());
193 let array_builder = ArrayDataBuilder::new(array_type)
194 .len(self.len())
195 .add_buffer(offset_buffer)
196 .add_buffer(value_buffer)
197 .nulls(self.null_buffer_builder.finish_cloned());
198
199 let array_data = unsafe { array_builder.build_unchecked() };
200 GenericByteArray::from(array_data)
201 }
202
203 pub fn values_slice(&self) -> &[u8] {
205 self.value_builder.as_slice()
206 }
207
208 pub fn offsets_slice(&self) -> &[T::Offset] {
210 self.offsets_builder.as_slice()
211 }
212
213 pub fn validity_slice(&self) -> Option<&[u8]> {
215 self.null_buffer_builder.as_slice()
216 }
217
218 pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> {
220 self.null_buffer_builder.as_slice_mut()
221 }
222}
223
224impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> {
225 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
226 write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?;
227 f.debug_struct("")
228 .field("value_builder", &self.value_builder)
229 .field("offsets_builder", &self.offsets_builder)
230 .field("null_buffer_builder", &self.null_buffer_builder)
231 .finish()
232 }
233}
234
235impl<T: ByteArrayType> Default for GenericByteBuilder<T> {
236 fn default() -> Self {
237 Self::new()
238 }
239}
240
241impl<T: ByteArrayType> ArrayBuilder for GenericByteBuilder<T> {
242 fn len(&self) -> usize {
244 self.null_buffer_builder.len()
245 }
246
247 fn finish(&mut self) -> ArrayRef {
249 Arc::new(self.finish())
250 }
251
252 fn finish_cloned(&self) -> ArrayRef {
254 Arc::new(self.finish_cloned())
255 }
256
257 fn as_any(&self) -> &dyn Any {
259 self
260 }
261
262 fn as_any_mut(&mut self) -> &mut dyn Any {
264 self
265 }
266
267 fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
269 self
270 }
271}
272
273impl<T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>> for GenericByteBuilder<T> {
274 #[inline]
275 fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
276 for v in iter {
277 self.append_option(v)
278 }
279 }
280}
281
/// A [`GenericByteBuilder`] specialised to [`GenericStringType`] with offset
/// type `O`.
pub type GenericStringBuilder<O> = GenericByteBuilder<GenericStringType<O>>;
332
333impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> {
334 fn write_str(&mut self, s: &str) -> std::fmt::Result {
335 self.value_builder.append_slice(s.as_bytes());
336 Ok(())
337 }
338}
339
/// A [`GenericByteBuilder`] specialised to [`GenericBinaryType`] with offset
/// type `O`.
pub type GenericBinaryBuilder<O> = GenericByteBuilder<GenericBinaryType<O>>;
386
387impl<O: OffsetSizeTrait> std::io::Write for GenericBinaryBuilder<O> {
388 fn write(&mut self, bs: &[u8]) -> std::io::Result<usize> {
389 self.value_builder.append_slice(bs);
390 Ok(bs.len())
391 }
392
393 fn flush(&mut self) -> std::io::Result<()> {
394 Ok(())
395 }
396}
397
398#[cfg(test)]
399mod tests {
400 use super::*;
401 use crate::array::Array;
402 use crate::GenericStringArray;
403 use arrow_buffer::NullBuffer;
404 use std::fmt::Write as _;
405 use std::io::Write as _;
406
407 fn _test_generic_binary_builder<O: OffsetSizeTrait>() {
408 let mut builder = GenericBinaryBuilder::<O>::new();
409
410 builder.append_value(b"hello");
411 builder.append_value(b"");
412 builder.append_null();
413 builder.append_value(b"rust");
414
415 let array = builder.finish();
416
417 assert_eq!(4, array.len());
418 assert_eq!(1, array.null_count());
419 assert_eq!(b"hello", array.value(0));
420 assert_eq!([] as [u8; 0], array.value(1));
421 assert!(array.is_null(2));
422 assert_eq!(b"rust", array.value(3));
423 assert_eq!(O::from_usize(5).unwrap(), array.value_offsets()[2]);
424 assert_eq!(O::from_usize(4).unwrap(), array.value_length(3));
425 }
426
/// Runs the generic binary builder checks with `i32` as the offset type.
#[test]
fn test_binary_builder() {
    _test_generic_binary_builder::<i32>();
}
431
/// Runs the generic binary builder checks with `i64` as the offset type.
#[test]
fn test_large_binary_builder() {
    _test_generic_binary_builder::<i64>();
}
436
437 fn _test_generic_binary_builder_all_nulls<O: OffsetSizeTrait>() {
438 let mut builder = GenericBinaryBuilder::<O>::new();
439 builder.append_null();
440 builder.append_null();
441 builder.append_null();
442 assert_eq!(3, builder.len());
443 assert!(!builder.is_empty());
444
445 let array = builder.finish();
446 assert_eq!(3, array.null_count());
447 assert_eq!(3, array.len());
448 assert!(array.is_null(0));
449 assert!(array.is_null(1));
450 assert!(array.is_null(2));
451 }
452
/// Runs the all-nulls checks with `i32` as the offset type.
#[test]
fn test_binary_builder_all_nulls() {
    _test_generic_binary_builder_all_nulls::<i32>();
}
457
/// Runs the all-nulls checks with `i64` as the offset type.
#[test]
fn test_large_binary_builder_all_nulls() {
    _test_generic_binary_builder_all_nulls::<i64>();
}
462
463 fn _test_generic_binary_builder_reset<O: OffsetSizeTrait>() {
464 let mut builder = GenericBinaryBuilder::<O>::new();
465
466 builder.append_value(b"hello");
467 builder.append_value(b"");
468 builder.append_null();
469 builder.append_value(b"rust");
470 builder.finish();
471
472 assert!(builder.is_empty());
473
474 builder.append_value(b"parquet");
475 builder.append_null();
476 builder.append_value(b"arrow");
477 builder.append_value(b"");
478 let array = builder.finish();
479
480 assert_eq!(4, array.len());
481 assert_eq!(1, array.null_count());
482 assert_eq!(b"parquet", array.value(0));
483 assert!(array.is_null(1));
484 assert_eq!(b"arrow", array.value(2));
485 assert_eq!(b"", array.value(1));
486 assert_eq!(O::zero(), array.value_offsets()[0]);
487 assert_eq!(O::from_usize(7).unwrap(), array.value_offsets()[2]);
488 assert_eq!(O::from_usize(5).unwrap(), array.value_length(2));
489 }
490
/// Runs the builder reset checks with `i32` as the offset type.
#[test]
fn test_binary_builder_reset() {
    _test_generic_binary_builder_reset::<i32>();
}
495
/// Runs the builder reset checks with `i64` as the offset type.
#[test]
fn test_large_binary_builder_reset() {
    _test_generic_binary_builder_reset::<i64>();
}
500
501 fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
502 let mut builder = GenericStringBuilder::<O>::new();
503 let owned = "arrow".to_owned();
504
505 builder.append_value("hello");
506 builder.append_value("");
507 builder.append_value(&owned);
508 builder.append_null();
509 builder.append_option(Some("rust"));
510 builder.append_option(None::<&str>);
511 builder.append_option(None::<String>);
512 assert_eq!(7, builder.len());
513
514 assert_eq!(
515 GenericStringArray::<O>::from(vec![
516 Some("hello"),
517 Some(""),
518 Some("arrow"),
519 None,
520 Some("rust"),
521 None,
522 None
523 ]),
524 builder.finish()
525 );
526 }
527
/// Runs the string builder checks with `i32` as the offset type.
#[test]
fn test_string_array_builder() {
    _test_generic_string_array_builder::<i32>();
}
532
/// Runs the string builder checks with `i64` as the offset type.
#[test]
fn test_large_string_array_builder() {
    _test_generic_string_array_builder::<i64>();
}
537
538 fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
539 let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);
540
541 builder.append_value("hello");
542 builder.append_value("rust");
543 builder.append_null();
544
545 builder.finish();
546 assert!(builder.is_empty());
547 assert_eq!(&[O::zero()], builder.offsets_slice());
548
549 builder.append_value("arrow");
550 builder.append_value("parquet");
551 let arr = builder.finish();
552 assert!(arr.nulls().is_none());
554 assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
555 }
556
/// Runs the `finish` checks with `i32` as the offset type.
#[test]
fn test_string_array_builder_finish() {
    _test_generic_string_array_builder_finish::<i32>();
}
561
/// Runs the `finish` checks with `i64` as the offset type.
#[test]
fn test_large_string_array_builder_finish() {
    _test_generic_string_array_builder_finish::<i64>();
}
566
567 fn _test_generic_string_array_builder_finish_cloned<O: OffsetSizeTrait>() {
568 let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);
569
570 builder.append_value("hello");
571 builder.append_value("rust");
572 builder.append_null();
573
574 let mut arr = builder.finish_cloned();
575 assert!(!builder.is_empty());
576 assert_eq!(3, arr.len());
577
578 builder.append_value("arrow");
579 builder.append_value("parquet");
580 arr = builder.finish();
581
582 assert!(arr.nulls().is_some());
583 assert_eq!(&[O::zero()], builder.offsets_slice());
584 assert_eq!(5, arr.len());
585 }
586
/// Runs the `finish_cloned` checks with `i32` as the offset type.
#[test]
fn test_string_array_builder_finish_cloned() {
    _test_generic_string_array_builder_finish_cloned::<i32>();
}
591
/// Runs the `finish_cloned` checks with `i64` as the offset type.
#[test]
fn test_large_string_array_builder_finish_cloned() {
    _test_generic_string_array_builder_finish_cloned::<i64>();
}
596
/// Extends the builder from two iterators and checks the resulting offsets
/// and concatenated value bytes.
#[test]
fn test_extend() {
    let first_batch = ["a", "b", "c", "", "a", "b", "c"];
    let second_batch = ["d", "cupcakes", "hello"];

    let mut strings = GenericStringBuilder::<i32>::new();
    strings.extend(first_batch.into_iter().map(Some));
    strings.extend(second_batch.into_iter().map(Some));

    let built = strings.finish();
    assert_eq!(built.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]);
    assert_eq!(built.value_data(), b"abcabcdcupcakeshello");
}
606
/// Checks that text written through `std::fmt::Write` accumulates into the
/// item completed by the next `append_value("")`.
#[test]
fn test_write_str() {
    let mut sink = GenericStringBuilder::<i32>::new();

    write!(sink, "foo").unwrap();
    sink.append_value("");

    writeln!(sink, "bar").unwrap();
    sink.append_value("");

    // Two writes before the append end up in the same item.
    write!(sink, "fiz").unwrap();
    write!(sink, "buz").unwrap();
    sink.append_value("");

    let built = sink.finish();
    let items: Vec<_> = built.iter().flatten().collect();
    assert_eq!(items, &["foo", "bar\n", "fizbuz"]);
}
621
/// Checks that bytes written through `std::io::Write` accumulate into the
/// item completed by the next `append_value("")`.
#[test]
fn test_write_bytes() {
    let mut sink = GenericBinaryBuilder::<i32>::new();

    write!(sink, "foo").unwrap();
    sink.append_value("");

    writeln!(sink, "bar").unwrap();
    sink.append_value("");

    // Two writes before the append end up in the same item.
    write!(sink, "fiz").unwrap();
    write!(sink, "buz").unwrap();
    sink.append_value("");

    let built = sink.finish();
    let items: Vec<_> = built.iter().flatten().collect();
    assert_eq!(
        items,
        &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()]
    );
}
639
/// Appends three null-free arrays and checks that the result equals one
/// array built from the whole input.
#[test]
fn test_append_array_without_nulls() {
    let words = vec![
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ];
    let head = GenericStringArray::<i32>::from(words[..3].to_vec());
    let middle = GenericStringArray::<i32>::from(words[3..7].to_vec());
    let tail = GenericStringArray::<i32>::from(words[7..].to_vec());

    let mut strings = GenericStringBuilder::<i32>::new();
    for part in [&head, &middle, &tail] {
        strings.append_array(part);
    }

    let actual = strings.finish();
    let expected = GenericStringArray::<i32>::from(words);

    assert_eq!(actual, expected);
}
660
/// Appends three arrays containing nulls (the middle one is all nulls) and
/// checks that the result equals one array built from the whole input.
#[test]
fn test_append_array_with_nulls() {
    let entries = vec![
        Some("hello"),
        None,
        Some("how"),
        None,
        None,
        None,
        None,
        Some("I"),
        Some("am"),
        Some("doing"),
        Some("well"),
    ];
    let head = GenericStringArray::<i32>::from(entries[..3].to_vec());
    let middle = GenericStringArray::<i32>::from(entries[3..7].to_vec());
    let tail = GenericStringArray::<i32>::from(entries[7..].to_vec());

    let mut strings = GenericStringBuilder::<i32>::new();
    for part in [&head, &middle, &tail] {
        strings.append_array(part);
    }

    let actual = strings.finish();
    let expected = GenericStringArray::<i32>::from(entries);

    assert_eq!(actual, expected);
}
690
/// Appending an empty array must leave the builder empty.
#[test]
fn test_append_empty_array() {
    let empty = GenericStringArray::<i32>::from(Vec::<&str>::new());

    let mut strings = GenericStringBuilder::<i32>::new();
    strings.append_array(&empty);

    let built = strings.finish();
    assert_eq!(built.len(), 0);
}
699
/// Appends a slice whose first offset is non-zero and whose last offset is
/// not the end of the parent, and checks that only the sliced entries appear.
#[test]
fn test_append_array_with_offset_not_starting_at_0() {
    let entries = vec![
        Some("hello"),
        None,
        Some("how"),
        None,
        None,
        None,
        None,
        Some("I"),
        Some("am"),
        Some("doing"),
        Some("well"),
    ];
    let parent = GenericStringArray::<i32>::from(entries);
    let window = parent.slice(1, 4);

    // Preconditions: the slice really starts and ends inside the parent.
    assert_ne!(window.offsets()[0].as_usize(), 0);
    assert_ne!(window.offsets().last(), parent.offsets().last());

    let mut strings = GenericStringBuilder::<i32>::new();
    strings.append_array(&window);
    let actual = strings.finish();

    let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]);

    assert_eq!(actual, expected);
}
729
/// Appends arrays whose null entries still have bytes underneath them, and
/// checks that those bytes are copied into the value buffer unchanged.
#[test]
fn test_append_underlying_null_values_added_as_is() {
    // Build each input from a null-free array, then re-wrap its buffers with
    // a validity mask so the masked entries keep their underlying bytes.
    let first_masked = {
        let words = vec![
            "hello", "world", "how", "are", "you", "doing", "today", "I", "am",
        ];
        let (offsets, buffer, _) = GenericStringArray::<i32>::from(words).into_parts();

        GenericStringArray::<i32>::new(
            offsets,
            buffer,
            Some(NullBuffer::from(&[
                true, false, true, false, false, true, true, true, false,
            ])),
        )
    };
    let second_masked = {
        let words = vec!["doing", "well", "thank", "you", "for", "asking"];
        let (offsets, buffer, _) = GenericStringArray::<i32>::from(words).into_parts();

        GenericStringArray::<i32>::new(
            offsets,
            buffer,
            Some(NullBuffer::from(&[false, false, true, false, true, true])),
        )
    };

    let mut strings = GenericStringBuilder::<i32>::new();
    strings.append_array(&first_masked);
    strings.append_array(&second_masked);

    let actual = strings.finish();
    let expected = GenericStringArray::<i32>::from(vec![
        Some("hello"),
        None, // "world" is masked out
        Some("how"),
        None, // "are" is masked out
        None, // "you" is masked out
        Some("doing"),
        Some("today"),
        Some("I"),
        None, // "am" is masked out
        None, // "doing" is masked out
        None, // "well" is masked out
        Some("thank"),
        None, // "you" is masked out
        Some("for"),
        Some("asking"),
    ]);

    assert_eq!(actual, expected);

    // The value buffer must hold every word, including the masked ones.
    let all_words = [
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ];
    let expected_underlying_buffer = Buffer::from(all_words.join("").as_bytes());
    assert_eq!(actual.values(), &expected_underlying_buffer);
}
792
/// Appends three consecutive slices of one array and checks that the result
/// equals the original array.
#[test]
fn append_array_with_continues_indices() {
    let words = vec![
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ];
    let parent = GenericStringArray::<i32>::from(words);
    let head = parent.slice(0, 3);
    let middle = parent.slice(3, 4);
    let tail = parent.slice(7, parent.len() - 7);

    let mut strings = GenericStringBuilder::<i32>::new();
    for part in [&head, &middle, &tail] {
        strings.append_array(part);
    }

    let actual = strings.finish();

    assert_eq!(actual, parent);
}
813}