1use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder};
19use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
20use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait};
21use arrow_buffer::NullBufferBuilder;
22use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
23use arrow_data::ArrayDataBuilder;
24use std::any::Any;
25use std::sync::Arc;
26
/// Builder for [`GenericByteArray`].
///
/// The in-progress array is kept as three separate buffers: the concatenated
/// value bytes, the offsets delimiting each value, and the validity bitmap.
pub struct GenericByteBuilder<T: ByteArrayType> {
    /// Concatenated bytes of every value appended so far.
    value_builder: UInt8BufferBuilder,
    /// Value boundaries. Starts with a single zero offset and gains one entry
    /// per appended item, so it holds one more entry than there are items.
    offsets_builder: BufferBuilder<T::Offset>,
    /// Validity of each appended item (`true` for a value, `false` for a null).
    null_buffer_builder: NullBufferBuilder,
}
36
37impl<T: ByteArrayType> GenericByteBuilder<T> {
38 pub fn new() -> Self {
40 Self::with_capacity(1024, 1024)
41 }
42
43 pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
50 let mut offsets_builder = BufferBuilder::<T::Offset>::new(item_capacity + 1);
51 offsets_builder.append(T::Offset::from_usize(0).unwrap());
52 Self {
53 value_builder: UInt8BufferBuilder::new(data_capacity),
54 offsets_builder,
55 null_buffer_builder: NullBufferBuilder::new(item_capacity),
56 }
57 }
58
59 pub unsafe fn new_from_buffer(
66 offsets_buffer: MutableBuffer,
67 value_buffer: MutableBuffer,
68 null_buffer: Option<MutableBuffer>,
69 ) -> Self {
70 let offsets_builder = BufferBuilder::<T::Offset>::new_from_buffer(offsets_buffer);
71 let value_builder = BufferBuilder::<u8>::new_from_buffer(value_buffer);
72
73 let null_buffer_builder = null_buffer
74 .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1))
75 .unwrap_or_else(|| NullBufferBuilder::new_with_len(offsets_builder.len() - 1));
76
77 Self {
78 offsets_builder,
79 value_builder,
80 null_buffer_builder,
81 }
82 }
83
84 #[inline]
85 fn next_offset(&self) -> T::Offset {
86 T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow")
87 }
88
89 #[inline]
105 pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
106 self.value_builder.append_slice(value.as_ref().as_ref());
107 self.null_buffer_builder.append(true);
108 self.offsets_builder.append(self.next_offset());
109 }
110
111 #[inline]
118 pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
119 match value {
120 None => self.append_null(),
121 Some(v) => self.append_value(v),
122 };
123 }
124
125 #[inline]
127 pub fn append_null(&mut self) {
128 self.null_buffer_builder.append(false);
129 self.offsets_builder.append(self.next_offset());
130 }
131
132 #[inline]
134 pub fn append_nulls(&mut self, n: usize) {
135 self.null_buffer_builder.append_n_nulls(n);
136 let next_offset = self.next_offset();
137 self.offsets_builder.append_n(n, next_offset);
138 }
139
140 #[inline]
143 pub fn append_array(&mut self, array: &GenericByteArray<T>) {
144 if array.len() == 0 {
145 return;
146 }
147
148 let offsets = array.offsets();
149
150 if self.next_offset() == offsets[0] {
153 self.offsets_builder.append_slice(&offsets[1..]);
154 } else {
155 let shift: T::Offset = self.next_offset() - offsets[0];
157
158 let mut intermediate = Vec::with_capacity(offsets.len() - 1);
162
163 for &offset in &offsets[1..] {
164 intermediate.push(offset + shift)
165 }
166
167 self.offsets_builder.append_slice(&intermediate);
168 }
169
170 self.value_builder.append_slice(
172 &array.values().as_slice()[offsets[0].as_usize()..offsets[array.len()].as_usize()],
173 );
174
175 if let Some(null_buffer) = array.nulls() {
176 self.null_buffer_builder.append_buffer(null_buffer);
177 } else {
178 self.null_buffer_builder.append_n_non_nulls(array.len());
179 }
180 }
181
182 pub fn finish(&mut self) -> GenericByteArray<T> {
184 let array_type = T::DATA_TYPE;
185 let array_builder = ArrayDataBuilder::new(array_type)
186 .len(self.len())
187 .add_buffer(self.offsets_builder.finish())
188 .add_buffer(self.value_builder.finish())
189 .nulls(self.null_buffer_builder.finish());
190
191 self.offsets_builder.append(self.next_offset());
192 let array_data = unsafe { array_builder.build_unchecked() };
193 GenericByteArray::from(array_data)
194 }
195
196 pub fn finish_cloned(&self) -> GenericByteArray<T> {
198 let array_type = T::DATA_TYPE;
199 let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice());
200 let value_buffer = Buffer::from_slice_ref(self.value_builder.as_slice());
201 let array_builder = ArrayDataBuilder::new(array_type)
202 .len(self.len())
203 .add_buffer(offset_buffer)
204 .add_buffer(value_buffer)
205 .nulls(self.null_buffer_builder.finish_cloned());
206
207 let array_data = unsafe { array_builder.build_unchecked() };
208 GenericByteArray::from(array_data)
209 }
210
211 pub fn values_slice(&self) -> &[u8] {
213 self.value_builder.as_slice()
214 }
215
216 pub fn offsets_slice(&self) -> &[T::Offset] {
218 self.offsets_builder.as_slice()
219 }
220
221 pub fn validity_slice(&self) -> Option<&[u8]> {
223 self.null_buffer_builder.as_slice()
224 }
225
226 pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> {
228 self.null_buffer_builder.as_slice_mut()
229 }
230}
231
232impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> {
233 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
234 write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?;
235 f.debug_struct("")
236 .field("value_builder", &self.value_builder)
237 .field("offsets_builder", &self.offsets_builder)
238 .field("null_buffer_builder", &self.null_buffer_builder)
239 .finish()
240 }
241}
242
243impl<T: ByteArrayType> Default for GenericByteBuilder<T> {
244 fn default() -> Self {
245 Self::new()
246 }
247}
248
249impl<T: ByteArrayType> ArrayBuilder for GenericByteBuilder<T> {
250 fn len(&self) -> usize {
252 self.null_buffer_builder.len()
253 }
254
255 fn finish(&mut self) -> ArrayRef {
257 Arc::new(self.finish())
258 }
259
260 fn finish_cloned(&self) -> ArrayRef {
262 Arc::new(self.finish_cloned())
263 }
264
265 fn as_any(&self) -> &dyn Any {
267 self
268 }
269
270 fn as_any_mut(&mut self) -> &mut dyn Any {
272 self
273 }
274
275 fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
277 self
278 }
279}
280
281impl<T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>> for GenericByteBuilder<T> {
282 #[inline]
283 fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
284 for v in iter {
285 self.append_option(v)
286 }
287 }
288}
289
290pub type GenericStringBuilder<O> = GenericByteBuilder<GenericStringType<O>>;
340
341impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> {
342 fn write_str(&mut self, s: &str) -> std::fmt::Result {
343 self.value_builder.append_slice(s.as_bytes());
344 Ok(())
345 }
346}
347
348pub type GenericBinaryBuilder<O> = GenericByteBuilder<GenericBinaryType<O>>;
394
395impl<O: OffsetSizeTrait> std::io::Write for GenericBinaryBuilder<O> {
396 fn write(&mut self, bs: &[u8]) -> std::io::Result<usize> {
397 self.value_builder.append_slice(bs);
398 Ok(bs.len())
399 }
400
401 fn flush(&mut self) -> std::io::Result<()> {
402 Ok(())
403 }
404}
405
#[cfg(test)]
mod tests {
    use super::*;
    use crate::array::Array;
    use crate::GenericStringArray;
    use arrow_buffer::NullBuffer;
    use std::fmt::Write as _;
    use std::io::Write as _;

    /// Values and a null appended one by one end up at the expected positions
    /// with the expected offsets and lengths.
    fn _test_generic_binary_builder<O: OffsetSizeTrait>() {
        let mut builder = GenericBinaryBuilder::<O>::new();

        builder.append_value(b"hello");
        builder.append_value(b"");
        builder.append_null();
        builder.append_value(b"rust");

        let array = builder.finish();

        assert_eq!(4, array.len());
        assert_eq!(1, array.null_count());
        assert_eq!(b"hello", array.value(0));
        assert_eq!([] as [u8; 0], array.value(1));
        assert!(array.is_null(2));
        assert_eq!(b"rust", array.value(3));
        assert_eq!(O::from_usize(5).unwrap(), array.value_offsets()[2]);
        assert_eq!(O::from_usize(4).unwrap(), array.value_length(3));
    }

    #[test]
    fn test_binary_builder() {
        _test_generic_binary_builder::<i32>()
    }

    #[test]
    fn test_large_binary_builder() {
        _test_generic_binary_builder::<i64>()
    }

    /// A builder that only ever received nulls produces an all-null array.
    fn _test_generic_binary_builder_all_nulls<O: OffsetSizeTrait>() {
        let mut builder = GenericBinaryBuilder::<O>::new();
        builder.append_null();
        builder.append_null();
        builder.append_null();
        builder.append_nulls(2);
        assert_eq!(5, builder.len());
        assert!(!builder.is_empty());

        let array = builder.finish();
        assert_eq!(5, array.null_count());
        assert_eq!(5, array.len());
        for idx in 0..5 {
            assert!(array.is_null(idx));
        }
    }

    #[test]
    fn test_binary_builder_all_nulls() {
        _test_generic_binary_builder_all_nulls::<i32>()
    }

    #[test]
    fn test_large_binary_builder_all_nulls() {
        _test_generic_binary_builder_all_nulls::<i64>()
    }

    /// After `finish` the builder is empty and can be reused from scratch.
    fn _test_generic_binary_builder_reset<O: OffsetSizeTrait>() {
        let mut builder = GenericBinaryBuilder::<O>::new();

        // First round: the result is discarded, only the reset matters.
        builder.append_value(b"hello");
        builder.append_value(b"");
        builder.append_null();
        builder.append_value(b"rust");
        builder.finish();

        assert!(builder.is_empty());

        // Second round on the same builder.
        builder.append_value(b"parquet");
        builder.append_null();
        builder.append_value(b"arrow");
        builder.append_value(b"");
        builder.append_nulls(2);
        builder.append_value(b"hi");
        let array = builder.finish();

        assert_eq!(7, array.len());
        assert_eq!(3, array.null_count());
        assert_eq!(b"parquet", array.value(0));
        assert!(array.is_null(1));
        assert!(array.is_null(4));
        assert!(array.is_null(5));
        assert_eq!(b"arrow", array.value(2));
        assert_eq!(b"", array.value(1));
        assert_eq!(b"hi", array.value(6));

        let value_offsets = array.value_offsets();
        assert_eq!(O::zero(), value_offsets[0]);
        assert_eq!(O::from_usize(7).unwrap(), value_offsets[2]);
        assert_eq!(O::from_usize(14).unwrap(), value_offsets[7]);
        assert_eq!(O::from_usize(5).unwrap(), array.value_length(2));
    }

    #[test]
    fn test_binary_builder_reset() {
        _test_generic_binary_builder_reset::<i32>()
    }

    #[test]
    fn test_large_binary_builder_reset() {
        _test_generic_binary_builder_reset::<i64>()
    }

    /// Every append flavour (`&str`, `&String`, options, bulk nulls) yields the
    /// same array as building it from a vector of options.
    fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
        let mut builder = GenericStringBuilder::<O>::new();
        let owned = "arrow".to_owned();

        builder.append_value("hello");
        builder.append_value("");
        builder.append_value(&owned);
        builder.append_null();
        builder.append_option(Some("rust"));
        builder.append_option(None::<&str>);
        builder.append_option(None::<String>);
        builder.append_nulls(2);
        builder.append_value("parquet");
        assert_eq!(10, builder.len());

        let expected = GenericStringArray::<O>::from(vec![
            Some("hello"),
            Some(""),
            Some("arrow"),
            None,
            Some("rust"),
            None,
            None,
            None,
            None,
            Some("parquet"),
        ]);
        assert_eq!(expected, builder.finish());
    }

    #[test]
    fn test_string_array_builder() {
        _test_generic_string_array_builder::<i32>()
    }

    #[test]
    fn test_large_string_array_builder() {
        _test_generic_string_array_builder::<i64>()
    }

    /// `finish` resets offsets and validity, so a later null-free batch has no
    /// null buffer.
    fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
        let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);

        builder.append_value("hello");
        builder.append_value("rust");
        builder.append_null();

        builder.finish();
        assert!(builder.is_empty());
        assert_eq!(&[O::zero()], builder.offsets_slice());

        builder.append_value("arrow");
        builder.append_value("parquet");
        let arr = builder.finish();
        assert!(arr.nulls().is_none());
        assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
    }

    #[test]
    fn test_string_array_builder_finish() {
        _test_generic_string_array_builder_finish::<i32>()
    }

    #[test]
    fn test_large_string_array_builder_finish() {
        _test_generic_string_array_builder_finish::<i64>()
    }

    /// `finish_cloned` leaves the builder intact, so later appends extend the
    /// same contents.
    fn _test_generic_string_array_builder_finish_cloned<O: OffsetSizeTrait>() {
        let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);

        builder.append_value("hello");
        builder.append_value("rust");
        builder.append_null();

        let mut arr = builder.finish_cloned();
        assert!(!builder.is_empty());
        assert_eq!(3, arr.len());

        builder.append_value("arrow");
        builder.append_value("parquet");
        arr = builder.finish();

        assert!(arr.nulls().is_some());
        assert_eq!(&[O::zero()], builder.offsets_slice());
        assert_eq!(5, arr.len());
    }

    #[test]
    fn test_string_array_builder_finish_cloned() {
        _test_generic_string_array_builder_finish_cloned::<i32>()
    }

    #[test]
    fn test_large_string_array_builder_finish_cloned() {
        _test_generic_string_array_builder_finish_cloned::<i64>()
    }

    /// `Extend` appends items in order across multiple calls.
    #[test]
    fn test_extend() {
        let mut builder = GenericStringBuilder::<i32>::new();
        builder.extend(["a", "b", "c", "", "a", "b", "c"].into_iter().map(Some));
        builder.extend(["d", "cupcakes", "hello"].into_iter().map(Some));
        let array = builder.finish();
        assert_eq!(array.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]);
        assert_eq!(array.value_data(), b"abcabcdcupcakeshello");
    }

    /// Text written through `fmt::Write` is folded into the next appended value.
    #[test]
    fn test_write_str() {
        let mut builder = GenericStringBuilder::<i32>::new();

        write!(builder, "foo").unwrap();
        builder.append_value("");

        writeln!(builder, "bar").unwrap();
        builder.append_value("");

        write!(builder, "fiz").unwrap();
        write!(builder, "buz").unwrap();
        builder.append_value("");

        let a = builder.finish();
        let r: Vec<_> = a.iter().flatten().collect();
        assert_eq!(r, &["foo", "bar\n", "fizbuz"])
    }

    /// Bytes written through `io::Write` are folded into the next appended value.
    #[test]
    fn test_write_bytes() {
        let mut builder = GenericBinaryBuilder::<i32>::new();

        write!(builder, "foo").unwrap();
        builder.append_value("");

        writeln!(builder, "bar").unwrap();
        builder.append_value("");

        write!(builder, "fiz").unwrap();
        write!(builder, "buz").unwrap();
        builder.append_value("");

        let a = builder.finish();
        let r: Vec<_> = a.iter().flatten().collect();
        assert_eq!(
            r,
            &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()]
        )
    }

    /// Appending null-free arrays back to back equals building from the full input.
    #[test]
    fn test_append_array_without_nulls() {
        let input = vec![
            "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
            "thank", "you", "for", "asking",
        ];
        let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec());
        let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec());
        let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec());

        let mut builder = GenericStringBuilder::<i32>::new();
        for part in [&arr1, &arr2, &arr3] {
            builder.append_array(part);
        }

        let actual = builder.finish();
        let expected = GenericStringArray::<i32>::from(input);

        assert_eq!(actual, expected);
    }

    /// Appending arrays that contain nulls preserves both values and validity.
    #[test]
    fn test_append_array_with_nulls() {
        let input = vec![
            Some("hello"),
            None,
            Some("how"),
            None,
            None,
            None,
            None,
            Some("I"),
            Some("am"),
            Some("doing"),
            Some("well"),
        ];
        let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec());
        let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec());
        let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec());

        let mut builder = GenericStringBuilder::<i32>::new();
        for part in [&arr1, &arr2, &arr3] {
            builder.append_array(part);
        }

        let actual = builder.finish();
        let expected = GenericStringArray::<i32>::from(input);

        assert_eq!(actual, expected);
    }

    /// Appending an empty array leaves the builder empty.
    #[test]
    fn test_append_empty_array() {
        let arr = GenericStringArray::<i32>::from(Vec::<&str>::new());
        let mut builder = GenericStringBuilder::<i32>::new();
        builder.append_array(&arr);
        let result = builder.finish();
        assert_eq!(result.len(), 0);
    }

    /// A sliced array whose first offset is non-zero is rebased when appended.
    #[test]
    fn test_append_array_with_offset_not_starting_at_0() {
        let input = vec![
            Some("hello"),
            None,
            Some("how"),
            None,
            None,
            None,
            None,
            Some("I"),
            Some("am"),
            Some("doing"),
            Some("well"),
        ];
        let full_array = GenericStringArray::<i32>::from(input);
        let sliced = full_array.slice(1, 4);

        // Guard the premise: the slice neither starts at offset 0 nor ends at
        // the end of the underlying values.
        assert_ne!(sliced.offsets()[0].as_usize(), 0);
        assert_ne!(sliced.offsets().last(), full_array.offsets().last());

        let mut builder = GenericStringBuilder::<i32>::new();
        builder.append_array(&sliced);
        let actual = builder.finish();

        let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]);

        assert_eq!(actual, expected);
    }

    /// Value bytes sitting underneath null slots are copied unchanged.
    #[test]
    fn test_append_underlying_null_values_added_as_is() {
        let input_1_array_with_nulls = {
            let input = vec![
                "hello", "world", "how", "are", "you", "doing", "today", "I", "am",
            ];
            let (offsets, buffer, _) = GenericStringArray::<i32>::from(input).into_parts();

            GenericStringArray::<i32>::new(
                offsets,
                buffer,
                Some(NullBuffer::from(&[
                    true, false, true, false, false, true, true, true, false,
                ])),
            )
        };
        let input_2_array_with_nulls = {
            let input = vec!["doing", "well", "thank", "you", "for", "asking"];
            let (offsets, buffer, _) = GenericStringArray::<i32>::from(input).into_parts();

            GenericStringArray::<i32>::new(
                offsets,
                buffer,
                Some(NullBuffer::from(&[false, false, true, false, true, true])),
            )
        };

        let mut builder = GenericStringBuilder::<i32>::new();
        builder.append_array(&input_1_array_with_nulls);
        builder.append_array(&input_2_array_with_nulls);

        let actual = builder.finish();
        let expected = GenericStringArray::<i32>::from(vec![
            // From the first array.
            Some("hello"),
            None,
            Some("how"),
            None,
            None,
            Some("doing"),
            Some("today"),
            Some("I"),
            None,
            // From the second array.
            None,
            None,
            Some("thank"),
            None,
            Some("for"),
            Some("asking"),
        ]);

        assert_eq!(actual, expected);

        // The raw value bytes still contain the text hidden behind the nulls.
        let expected_underlying_buffer = Buffer::from(
            [
                "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing",
                "well", "thank", "you", "for", "asking",
            ]
            .join("")
            .as_bytes(),
        );
        assert_eq!(actual.values(), &expected_underlying_buffer);
    }

    /// Consecutive slices of one array, appended in order, reproduce that array.
    #[test]
    fn append_array_with_continues_indices() {
        let input = vec![
            "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
            "thank", "you", "for", "asking",
        ];
        let full_array = GenericStringArray::<i32>::from(input);
        let slice1 = full_array.slice(0, 3);
        let slice2 = full_array.slice(3, 4);
        let slice3 = full_array.slice(7, full_array.len() - 7);

        let mut builder = GenericStringBuilder::<i32>::new();
        for part in [&slice1, &slice2, &slice3] {
            builder.append_array(part);
        }

        let actual = builder.finish();

        assert_eq!(actual, full_array);
    }
}