1use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24 bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35 null_bit_buffer: Option<&NullBuffer>,
36 offset: usize,
37 len: usize,
38) -> bool {
39 match null_bit_buffer {
40 Some(buffer) => {
41 match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42 Some((start, end)) => start != 0 || end != len,
43 None => len != 0, }
45 }
46 None => false, }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52 null_bit_buffer: Option<&NullBuffer>,
53 offset: usize,
54 len: usize,
55) -> usize {
56 if let Some(buf) = null_bit_buffer {
57 let buffer = buf.buffer();
58 len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59 } else {
60 0
61 }
62}
63
64#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67 let empty_buffer = MutableBuffer::new(0);
68 match data_type {
69 DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70 DataType::Boolean => {
71 let bytes = bit_util::ceil(capacity, 8);
72 let buffer = MutableBuffer::new(bytes);
73 [buffer, empty_buffer]
74 }
75 DataType::UInt8
76 | DataType::UInt16
77 | DataType::UInt32
78 | DataType::UInt64
79 | DataType::Int8
80 | DataType::Int16
81 | DataType::Int32
82 | DataType::Int64
83 | DataType::Float16
84 | DataType::Float32
85 | DataType::Float64
86 | DataType::Decimal128(_, _)
87 | DataType::Decimal256(_, _)
88 | DataType::Date32
89 | DataType::Time32(_)
90 | DataType::Date64
91 | DataType::Time64(_)
92 | DataType::Duration(_)
93 | DataType::Timestamp(_, _)
94 | DataType::Interval(_) => [
95 MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
96 empty_buffer,
97 ],
98 DataType::Utf8 | DataType::Binary => {
99 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
100 buffer.push(0i32);
102 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
103 }
104 DataType::LargeUtf8 | DataType::LargeBinary => {
105 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
106 buffer.push(0i64);
108 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
109 }
110 DataType::BinaryView | DataType::Utf8View => [
111 MutableBuffer::new(capacity * mem::size_of::<u128>()),
112 empty_buffer,
113 ],
114 DataType::List(_) | DataType::Map(_, _) => {
115 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
117 buffer.push(0i32);
118 [buffer, empty_buffer]
119 }
120 DataType::ListView(_) => [
121 MutableBuffer::new(capacity * mem::size_of::<i32>()),
122 MutableBuffer::new(capacity * mem::size_of::<i32>()),
123 ],
124 DataType::LargeList(_) => {
125 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
127 buffer.push(0i64);
128 [buffer, empty_buffer]
129 }
130 DataType::LargeListView(_) => [
131 MutableBuffer::new(capacity * mem::size_of::<i64>()),
132 MutableBuffer::new(capacity * mem::size_of::<i64>()),
133 ],
134 DataType::FixedSizeBinary(size) => {
135 [MutableBuffer::new(capacity * *size as usize), empty_buffer]
136 }
137 DataType::Dictionary(k, _) => [
138 MutableBuffer::new(capacity * k.primitive_width().unwrap()),
139 empty_buffer,
140 ],
141 DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
142 [empty_buffer, MutableBuffer::new(0)]
143 }
144 DataType::Union(_, mode) => {
145 let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
146 match mode {
147 UnionMode::Sparse => [type_ids, empty_buffer],
148 UnionMode::Dense => {
149 let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
150 [type_ids, offsets]
151 }
152 }
153 }
154 }
155}
156
157#[derive(Debug, Clone)]
203pub struct ArrayData {
204 data_type: DataType,
206
207 len: usize,
209
210 offset: usize,
212
213 buffers: Vec<Buffer>,
217
218 child_data: Vec<ArrayData>,
221
222 nulls: Option<NullBuffer>,
225}
226
227pub type ArrayDataRef = Arc<ArrayData>;
229
230impl ArrayData {
231 pub unsafe fn new_unchecked(
248 data_type: DataType,
249 len: usize,
250 null_count: Option<usize>,
251 null_bit_buffer: Option<Buffer>,
252 offset: usize,
253 buffers: Vec<Buffer>,
254 child_data: Vec<ArrayData>,
255 ) -> Self {
256 let mut skip_validation = UnsafeFlag::new();
257 skip_validation.set(true);
259
260 ArrayDataBuilder {
261 data_type,
262 len,
263 null_count,
264 null_bit_buffer,
265 nulls: None,
266 offset,
267 buffers,
268 child_data,
269 align_buffers: false,
270 skip_validation,
271 }
272 .build()
273 .unwrap()
274 }
275
276 pub fn try_new(
287 data_type: DataType,
288 len: usize,
289 null_bit_buffer: Option<Buffer>,
290 offset: usize,
291 buffers: Vec<Buffer>,
292 child_data: Vec<ArrayData>,
293 ) -> Result<Self, ArrowError> {
294 if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
298 let needed_len = bit_util::ceil(len + offset, 8);
299 if null_bit_buffer.len() < needed_len {
300 return Err(ArrowError::InvalidArgumentError(format!(
301 "null_bit_buffer size too small. got {} needed {}",
302 null_bit_buffer.len(),
303 needed_len
304 )));
305 }
306 }
307 let new_self = unsafe {
309 Self::new_unchecked(
310 data_type,
311 len,
312 None,
313 null_bit_buffer,
314 offset,
315 buffers,
316 child_data,
317 )
318 };
319
320 new_self.validate_data()?;
325 Ok(new_self)
326 }
327
328 #[inline]
330 pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
331 ArrayDataBuilder::new(data_type)
332 }
333
334 #[inline]
336 pub const fn data_type(&self) -> &DataType {
337 &self.data_type
338 }
339
340 pub fn buffers(&self) -> &[Buffer] {
342 &self.buffers
343 }
344
345 pub fn child_data(&self) -> &[ArrayData] {
348 &self.child_data[..]
349 }
350
351 #[inline]
353 pub fn is_null(&self, i: usize) -> bool {
354 match &self.nulls {
355 Some(v) => v.is_null(i),
356 None => false,
357 }
358 }
359
360 #[inline]
364 pub fn nulls(&self) -> Option<&NullBuffer> {
365 self.nulls.as_ref()
366 }
367
368 #[inline]
370 pub fn is_valid(&self, i: usize) -> bool {
371 !self.is_null(i)
372 }
373
374 #[inline]
376 pub const fn len(&self) -> usize {
377 self.len
378 }
379
380 #[inline]
382 pub const fn is_empty(&self) -> bool {
383 self.len == 0
384 }
385
386 #[inline]
388 pub const fn offset(&self) -> usize {
389 self.offset
390 }
391
392 #[inline]
394 pub fn null_count(&self) -> usize {
395 self.nulls
396 .as_ref()
397 .map(|x| x.null_count())
398 .unwrap_or_default()
399 }
400
401 pub fn get_buffer_memory_size(&self) -> usize {
413 let mut size = 0;
414 for buffer in &self.buffers {
415 size += buffer.capacity();
416 }
417 if let Some(bitmap) = &self.nulls {
418 size += bitmap.buffer().capacity()
419 }
420 for child in &self.child_data {
421 size += child.get_buffer_memory_size();
422 }
423 size
424 }
425
426 pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
439 let mut result: usize = 0;
440 let layout = layout(&self.data_type);
441
442 for spec in layout.buffers.iter() {
443 match spec {
444 BufferSpec::FixedWidth { byte_width, .. } => {
445 let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
446 ArrowError::ComputeError(
447 "Integer overflow computing buffer size".to_string(),
448 )
449 })?;
450 result += buffer_size;
451 }
452 BufferSpec::VariableWidth => {
453 let buffer_len: usize;
454 match self.data_type {
455 DataType::Utf8 | DataType::Binary => {
456 let offsets = self.typed_offsets::<i32>()?;
457 buffer_len = (offsets[self.len] - offsets[0] ) as usize;
458 }
459 DataType::LargeUtf8 | DataType::LargeBinary => {
460 let offsets = self.typed_offsets::<i64>()?;
461 buffer_len = (offsets[self.len] - offsets[0]) as usize;
462 }
463 _ => {
464 return Err(ArrowError::NotYetImplemented(format!(
465 "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
466 self.data_type
467 )))
468 }
469 };
470 result += buffer_len;
471 }
472 BufferSpec::BitMap => {
473 let buffer_size = bit_util::ceil(self.len, 8);
474 result += buffer_size;
475 }
476 BufferSpec::AlwaysNull => {
477 }
479 }
480 }
481
482 if self.nulls().is_some() {
483 result += bit_util::ceil(self.len, 8);
484 }
485
486 for child in &self.child_data {
487 result += child.get_slice_memory_size()?;
488 }
489 Ok(result)
490 }
491
492 pub fn get_array_memory_size(&self) -> usize {
501 let mut size = mem::size_of_val(self);
502
503 for buffer in &self.buffers {
505 size += mem::size_of::<Buffer>();
506 size += buffer.capacity();
507 }
508 if let Some(nulls) = &self.nulls {
509 size += nulls.buffer().capacity();
510 }
511 for child in &self.child_data {
512 size += child.get_array_memory_size();
513 }
514
515 size
516 }
517
518 pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
526 assert!((offset + length) <= self.len());
527
528 if let DataType::Struct(_) = self.data_type() {
529 let new_offset = self.offset + offset;
531 let new_data = ArrayData {
532 data_type: self.data_type().clone(),
533 len: length,
534 offset: new_offset,
535 buffers: self.buffers.clone(),
536 child_data: self
538 .child_data()
539 .iter()
540 .map(|data| data.slice(offset, length))
541 .collect(),
542 nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
543 };
544
545 new_data
546 } else {
547 let mut new_data = self.clone();
548
549 new_data.len = length;
550 new_data.offset = offset + self.offset;
551 new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
552
553 new_data
554 }
555 }
556
557 pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
563 &self.buffers()[buffer].typed_data()[self.offset..]
564 }
565
566 pub fn new_null(data_type: &DataType, len: usize) -> Self {
568 let bit_len = bit_util::ceil(len, 8);
569 let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
570
571 let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
572 Some(width) => (vec![zeroed(width * len)], vec![], true),
573 None => match data_type {
574 DataType::Null => (vec![], vec![], false),
575 DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
576 DataType::Binary | DataType::Utf8 => {
577 (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
578 }
579 DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
580 DataType::LargeBinary | DataType::LargeUtf8 => {
581 (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
582 }
583 DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
584 DataType::List(f) | DataType::Map(f, _) => (
585 vec![zeroed((len + 1) * 4)],
586 vec![ArrayData::new_empty(f.data_type())],
587 true,
588 ),
589 DataType::LargeList(f) => (
590 vec![zeroed((len + 1) * 8)],
591 vec![ArrayData::new_empty(f.data_type())],
592 true,
593 ),
594 DataType::FixedSizeList(f, list_len) => (
595 vec![],
596 vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
597 true,
598 ),
599 DataType::Struct(fields) => (
600 vec![],
601 fields
602 .iter()
603 .map(|f| Self::new_null(f.data_type(), len))
604 .collect(),
605 true,
606 ),
607 DataType::Dictionary(k, v) => (
608 vec![zeroed(k.primitive_width().unwrap() * len)],
609 vec![ArrayData::new_empty(v.as_ref())],
610 true,
611 ),
612 DataType::Union(f, mode) => {
613 let (id, _) = f.iter().next().unwrap();
614 let ids = Buffer::from_iter(std::iter::repeat(id).take(len));
615 let buffers = match mode {
616 UnionMode::Sparse => vec![ids],
617 UnionMode::Dense => {
618 let end_offset = i32::from_usize(len).unwrap();
619 vec![ids, Buffer::from_iter(0_i32..end_offset)]
620 }
621 };
622
623 let children = f
624 .iter()
625 .enumerate()
626 .map(|(idx, (_, f))| {
627 if idx == 0 || *mode == UnionMode::Sparse {
628 Self::new_null(f.data_type(), len)
629 } else {
630 Self::new_empty(f.data_type())
631 }
632 })
633 .collect();
634
635 (buffers, children, false)
636 }
637 DataType::RunEndEncoded(r, v) => {
638 let runs = match r.data_type() {
639 DataType::Int16 => {
640 let i = i16::from_usize(len).expect("run overflow");
641 Buffer::from_slice_ref([i])
642 }
643 DataType::Int32 => {
644 let i = i32::from_usize(len).expect("run overflow");
645 Buffer::from_slice_ref([i])
646 }
647 DataType::Int64 => {
648 let i = i64::from_usize(len).expect("run overflow");
649 Buffer::from_slice_ref([i])
650 }
651 dt => unreachable!("Invalid run ends data type {dt}"),
652 };
653
654 let builder = ArrayData::builder(r.data_type().clone())
655 .len(1)
656 .buffers(vec![runs]);
657
658 let runs = unsafe { builder.build_unchecked() };
661 (
662 vec![],
663 vec![runs, ArrayData::new_null(v.data_type(), 1)],
664 false,
665 )
666 }
667 d => unreachable!("{d}"),
668 },
669 };
670
671 let mut builder = ArrayDataBuilder::new(data_type.clone())
672 .len(len)
673 .buffers(buffers)
674 .child_data(child_data);
675
676 if has_nulls {
677 builder = builder.nulls(Some(NullBuffer::new_null(len)))
678 }
679
680 unsafe { builder.build_unchecked() }
683 }
684
685 pub fn new_empty(data_type: &DataType) -> Self {
687 Self::new_null(data_type, 0)
688 }
689
690 pub fn align_buffers(&mut self) {
699 let layout = layout(&self.data_type);
700 for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
701 if let BufferSpec::FixedWidth { alignment, .. } = spec {
702 if buffer.as_ptr().align_offset(*alignment) != 0 {
703 *buffer = Buffer::from_slice_ref(buffer.as_ref());
704 }
705 }
706 }
707 for data in self.child_data.iter_mut() {
709 data.align_buffers()
710 }
711 }
712
713 pub fn validate(&self) -> Result<(), ArrowError> {
724 let len_plus_offset = self.len + self.offset;
726
727 let layout = layout(&self.data_type);
729
730 if !layout.can_contain_null_mask && self.nulls.is_some() {
731 return Err(ArrowError::InvalidArgumentError(format!(
732 "Arrays of type {:?} cannot contain a null bitmask",
733 self.data_type,
734 )));
735 }
736
737 if self.buffers.len() < layout.buffers.len()
739 || (!layout.variadic && self.buffers.len() != layout.buffers.len())
740 {
741 return Err(ArrowError::InvalidArgumentError(format!(
742 "Expected {} buffers in array of type {:?}, got {}",
743 layout.buffers.len(),
744 self.data_type,
745 self.buffers.len(),
746 )));
747 }
748
749 for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
750 match spec {
751 BufferSpec::FixedWidth {
752 byte_width,
753 alignment,
754 } => {
755 let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
756
757 if buffer.len() < min_buffer_size {
758 return Err(ArrowError::InvalidArgumentError(format!(
759 "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
760 min_buffer_size, i, self.data_type, buffer.len()
761 )));
762 }
763
764 let align_offset = buffer.as_ptr().align_offset(*alignment);
765 if align_offset != 0 {
766 return Err(ArrowError::InvalidArgumentError(format!(
767 "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
768 self.data_type, align_offset.min(alignment - align_offset)
769 )));
770 }
771 }
772 BufferSpec::VariableWidth => {
773 }
777 BufferSpec::BitMap => {
778 let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
779 if buffer.len() < min_buffer_size {
780 return Err(ArrowError::InvalidArgumentError(format!(
781 "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
782 min_buffer_size, i, self.data_type, buffer.len()
783 )));
784 }
785 }
786 BufferSpec::AlwaysNull => {
787 }
789 }
790 }
791
792 if let Some(nulls) = self.nulls() {
794 if nulls.null_count() > self.len {
795 return Err(ArrowError::InvalidArgumentError(format!(
796 "null_count {} for an array exceeds length of {} elements",
797 nulls.null_count(),
798 self.len
799 )));
800 }
801
802 let actual_len = nulls.validity().len();
803 let needed_len = bit_util::ceil(len_plus_offset, 8);
804 if actual_len < needed_len {
805 return Err(ArrowError::InvalidArgumentError(format!(
806 "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
807 )));
808 }
809
810 if nulls.len() != self.len {
811 return Err(ArrowError::InvalidArgumentError(format!(
812 "null buffer incorrect size. got {} expected {}",
813 nulls.len(),
814 self.len
815 )));
816 }
817 }
818
819 self.validate_child_data()?;
820
821 match &self.data_type {
823 DataType::Utf8 | DataType::Binary => {
824 self.validate_offsets::<i32>(self.buffers[1].len())?;
825 }
826 DataType::LargeUtf8 | DataType::LargeBinary => {
827 self.validate_offsets::<i64>(self.buffers[1].len())?;
828 }
829 DataType::Dictionary(key_type, _value_type) => {
830 if !DataType::is_dictionary_key_type(key_type) {
832 return Err(ArrowError::InvalidArgumentError(format!(
833 "Dictionary key type must be integer, but was {key_type}"
834 )));
835 }
836 }
837 DataType::RunEndEncoded(run_ends_type, _) => {
838 if run_ends_type.is_nullable() {
839 return Err(ArrowError::InvalidArgumentError(
840 "The nullable should be set to false for the field defining run_ends array.".to_string()
841 ));
842 }
843 if !DataType::is_run_ends_type(run_ends_type.data_type()) {
844 return Err(ArrowError::InvalidArgumentError(format!(
845 "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
846 run_ends_type.data_type()
847 )));
848 }
849 }
850 _ => {}
851 };
852
853 Ok(())
854 }
855
856 fn typed_offsets<T: ArrowNativeType + num::Num>(&self) -> Result<&[T], ArrowError> {
863 if self.len == 0 && self.buffers[0].is_empty() {
865 return Ok(&[]);
866 }
867
868 self.typed_buffer(0, self.len + 1)
869 }
870
871 fn typed_buffer<T: ArrowNativeType + num::Num>(
873 &self,
874 idx: usize,
875 len: usize,
876 ) -> Result<&[T], ArrowError> {
877 let buffer = &self.buffers[idx];
878
879 let required_len = (len + self.offset) * mem::size_of::<T>();
880
881 if buffer.len() < required_len {
882 return Err(ArrowError::InvalidArgumentError(format!(
883 "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
884 idx,
885 self.data_type,
886 required_len,
887 buffer.len()
888 )));
889 }
890
891 Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
892 }
893
894 fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
897 &self,
898 values_length: usize,
899 ) -> Result<(), ArrowError> {
900 let offsets = self.typed_offsets::<T>()?;
902 if offsets.is_empty() {
903 return Ok(());
904 }
905
906 let first_offset = offsets[0].to_usize().ok_or_else(|| {
907 ArrowError::InvalidArgumentError(format!(
908 "Error converting offset[0] ({}) to usize for {}",
909 offsets[0], self.data_type
910 ))
911 })?;
912
913 let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
914 ArrowError::InvalidArgumentError(format!(
915 "Error converting offset[{}] ({}) to usize for {}",
916 self.len, offsets[self.len], self.data_type
917 ))
918 })?;
919
920 if first_offset > values_length {
921 return Err(ArrowError::InvalidArgumentError(format!(
922 "First offset {} of {} is larger than values length {}",
923 first_offset, self.data_type, values_length,
924 )));
925 }
926
927 if last_offset > values_length {
928 return Err(ArrowError::InvalidArgumentError(format!(
929 "Last offset {} of {} is larger than values length {}",
930 last_offset, self.data_type, values_length,
931 )));
932 }
933
934 if first_offset > last_offset {
935 return Err(ArrowError::InvalidArgumentError(format!(
936 "First offset {} in {} is smaller than last offset {}",
937 first_offset, self.data_type, last_offset,
938 )));
939 }
940
941 Ok(())
942 }
943
944 fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
947 &self,
948 values_length: usize,
949 ) -> Result<(), ArrowError> {
950 let offsets: &[T] = self.typed_buffer(0, self.len)?;
951 let sizes: &[T] = self.typed_buffer(1, self.len)?;
952 for i in 0..values_length {
953 let size = sizes[i].to_usize().ok_or_else(|| {
954 ArrowError::InvalidArgumentError(format!(
955 "Error converting size[{}] ({}) to usize for {}",
956 i, sizes[i], self.data_type
957 ))
958 })?;
959 let offset = offsets[i].to_usize().ok_or_else(|| {
960 ArrowError::InvalidArgumentError(format!(
961 "Error converting offset[{}] ({}) to usize for {}",
962 i, offsets[i], self.data_type
963 ))
964 })?;
965 if size
966 .checked_add(offset)
967 .expect("Offset and size have exceeded the usize boundary")
968 > values_length
969 {
970 return Err(ArrowError::InvalidArgumentError(format!(
971 "Size {} at index {} is larger than the remaining values for {}",
972 size, i, self.data_type
973 )));
974 }
975 }
976 Ok(())
977 }
978
979 fn validate_child_data(&self) -> Result<(), ArrowError> {
981 match &self.data_type {
982 DataType::List(field) | DataType::Map(field, _) => {
983 let values_data = self.get_single_valid_child_data(field.data_type())?;
984 self.validate_offsets::<i32>(values_data.len)?;
985 Ok(())
986 }
987 DataType::LargeList(field) => {
988 let values_data = self.get_single_valid_child_data(field.data_type())?;
989 self.validate_offsets::<i64>(values_data.len)?;
990 Ok(())
991 }
992 DataType::ListView(field) => {
993 let values_data = self.get_single_valid_child_data(field.data_type())?;
994 self.validate_offsets_and_sizes::<i32>(values_data.len)?;
995 Ok(())
996 }
997 DataType::LargeListView(field) => {
998 let values_data = self.get_single_valid_child_data(field.data_type())?;
999 self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1000 Ok(())
1001 }
1002 DataType::FixedSizeList(field, list_size) => {
1003 let values_data = self.get_single_valid_child_data(field.data_type())?;
1004
1005 let list_size: usize = (*list_size).try_into().map_err(|_| {
1006 ArrowError::InvalidArgumentError(format!(
1007 "{} has a negative list_size {}",
1008 self.data_type, list_size
1009 ))
1010 })?;
1011
1012 let expected_values_len = self.len
1013 .checked_mul(list_size)
1014 .expect("integer overflow computing expected number of expected values in FixedListSize");
1015
1016 if values_data.len < expected_values_len {
1017 return Err(ArrowError::InvalidArgumentError(format!(
1018 "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1019 values_data.len, list_size, list_size, self.data_type
1020 )));
1021 }
1022
1023 Ok(())
1024 }
1025 DataType::Struct(fields) => {
1026 self.validate_num_child_data(fields.len())?;
1027 for (i, field) in fields.iter().enumerate() {
1028 let field_data = self.get_valid_child_data(i, field.data_type())?;
1029
1030 if field_data.len < self.len {
1032 return Err(ArrowError::InvalidArgumentError(format!(
1033 "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1034 self.data_type, i, field.name(), field_data.len, self.len
1035 )));
1036 }
1037 }
1038 Ok(())
1039 }
1040 DataType::RunEndEncoded(run_ends_field, values_field) => {
1041 self.validate_num_child_data(2)?;
1042 let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1043 let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1044 if run_ends_data.len != values_data.len {
1045 return Err(ArrowError::InvalidArgumentError(format!(
1046 "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1047 run_ends_data.len, values_data.len
1048 )));
1049 }
1050 if run_ends_data.nulls.is_some() {
1051 return Err(ArrowError::InvalidArgumentError(
1052 "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1053 ));
1054 }
1055 Ok(())
1056 }
1057 DataType::Union(fields, mode) => {
1058 self.validate_num_child_data(fields.len())?;
1059
1060 for (i, (_, field)) in fields.iter().enumerate() {
1061 let field_data = self.get_valid_child_data(i, field.data_type())?;
1062
1063 if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1064 return Err(ArrowError::InvalidArgumentError(format!(
1065 "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1066 i, field_data.len, self.len + self.offset
1067 )));
1068 }
1069 }
1070 Ok(())
1071 }
1072 DataType::Dictionary(_key_type, value_type) => {
1073 self.get_single_valid_child_data(value_type)?;
1074 Ok(())
1075 }
1076 _ => {
1077 if !self.child_data.is_empty() {
1079 return Err(ArrowError::InvalidArgumentError(format!(
1080 "Expected no child arrays for type {} but got {}",
1081 self.data_type,
1082 self.child_data.len()
1083 )));
1084 }
1085 Ok(())
1086 }
1087 }
1088 }
1089
1090 fn get_single_valid_child_data(
1094 &self,
1095 expected_type: &DataType,
1096 ) -> Result<&ArrayData, ArrowError> {
1097 self.validate_num_child_data(1)?;
1098 self.get_valid_child_data(0, expected_type)
1099 }
1100
1101 fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1103 if self.child_data.len() != expected_len {
1104 Err(ArrowError::InvalidArgumentError(format!(
1105 "Value data for {} should contain {} child data array(s), had {}",
1106 self.data_type,
1107 expected_len,
1108 self.child_data.len()
1109 )))
1110 } else {
1111 Ok(())
1112 }
1113 }
1114
1115 fn get_valid_child_data(
1118 &self,
1119 i: usize,
1120 expected_type: &DataType,
1121 ) -> Result<&ArrayData, ArrowError> {
1122 let values_data = self.child_data.get(i).ok_or_else(|| {
1123 ArrowError::InvalidArgumentError(format!(
1124 "{} did not have enough child arrays. Expected at least {} but had only {}",
1125 self.data_type,
1126 i + 1,
1127 self.child_data.len()
1128 ))
1129 })?;
1130
1131 if expected_type != &values_data.data_type {
1132 return Err(ArrowError::InvalidArgumentError(format!(
1133 "Child type mismatch for {}. Expected {} but child data had {}",
1134 self.data_type, expected_type, values_data.data_type
1135 )));
1136 }
1137
1138 values_data.validate()?;
1139 Ok(values_data)
1140 }
1141
1142 pub fn validate_data(&self) -> Result<(), ArrowError> {
1158 self.validate()?;
1159
1160 self.validate_nulls()?;
1161 self.validate_values()?;
1162 Ok(())
1163 }
1164
1165 pub fn validate_full(&self) -> Result<(), ArrowError> {
1170 self.validate_data()?;
1171 self.child_data
1173 .iter()
1174 .enumerate()
1175 .try_for_each(|(i, child_data)| {
1176 child_data.validate_full().map_err(|e| {
1177 ArrowError::InvalidArgumentError(format!(
1178 "{} child #{} invalid: {}",
1179 self.data_type, i, e
1180 ))
1181 })
1182 })?;
1183 Ok(())
1184 }
1185
1186 pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1196 if let Some(nulls) = &self.nulls {
1197 let actual = nulls.len() - nulls.inner().count_set_bits();
1198 if actual != nulls.null_count() {
1199 return Err(ArrowError::InvalidArgumentError(format!(
1200 "null_count value ({}) doesn't match actual number of nulls in array ({})",
1201 nulls.null_count(),
1202 actual
1203 )));
1204 }
1205 }
1206
1207 match &self.data_type {
1212 DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1213 if !f.is_nullable() {
1214 self.validate_non_nullable(None, &self.child_data[0])?
1215 }
1216 }
1217 DataType::FixedSizeList(field, len) => {
1218 let child = &self.child_data[0];
1219 if !field.is_nullable() {
1220 match &self.nulls {
1221 Some(nulls) => {
1222 let element_len = *len as usize;
1223 let expanded = nulls.expand(element_len);
1224 self.validate_non_nullable(Some(&expanded), child)?;
1225 }
1226 None => self.validate_non_nullable(None, child)?,
1227 }
1228 }
1229 }
1230 DataType::Struct(fields) => {
1231 for (field, child) in fields.iter().zip(&self.child_data) {
1232 if !field.is_nullable() {
1233 self.validate_non_nullable(self.nulls(), child)?
1234 }
1235 }
1236 }
1237 _ => {}
1238 }
1239
1240 Ok(())
1241 }
1242
1243 fn validate_non_nullable(
1245 &self,
1246 mask: Option<&NullBuffer>,
1247 child: &ArrayData,
1248 ) -> Result<(), ArrowError> {
1249 let mask = match mask {
1250 Some(mask) => mask,
1251 None => {
1252 return match child.null_count() {
1253 0 => Ok(()),
1254 _ => Err(ArrowError::InvalidArgumentError(format!(
1255 "non-nullable child of type {} contains nulls not present in parent {}",
1256 child.data_type, self.data_type
1257 ))),
1258 }
1259 }
1260 };
1261
1262 match child.nulls() {
1263 Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1264 "non-nullable child of type {} contains nulls not present in parent",
1265 child.data_type
1266 ))),
1267 _ => Ok(()),
1268 }
1269 }
1270
1271 pub fn validate_values(&self) -> Result<(), ArrowError> {
1277 match &self.data_type {
1278 DataType::Utf8 => self.validate_utf8::<i32>(),
1279 DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1280 DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1281 DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1282 DataType::BinaryView => {
1283 let views = self.typed_buffer::<u128>(0, self.len)?;
1284 validate_binary_view(views, &self.buffers[1..])
1285 }
1286 DataType::Utf8View => {
1287 let views = self.typed_buffer::<u128>(0, self.len)?;
1288 validate_string_view(views, &self.buffers[1..])
1289 }
1290 DataType::List(_) | DataType::Map(_, _) => {
1291 let child = &self.child_data[0];
1292 self.validate_offsets_full::<i32>(child.len)
1293 }
1294 DataType::LargeList(_) => {
1295 let child = &self.child_data[0];
1296 self.validate_offsets_full::<i64>(child.len)
1297 }
1298 DataType::Union(_, _) => {
1299 Ok(())
1305 }
1306 DataType::Dictionary(key_type, _value_type) => {
1307 let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1308 let max_value = dictionary_length - 1;
1309 match key_type.as_ref() {
1310 DataType::UInt8 => self.check_bounds::<u8>(max_value),
1311 DataType::UInt16 => self.check_bounds::<u16>(max_value),
1312 DataType::UInt32 => self.check_bounds::<u32>(max_value),
1313 DataType::UInt64 => self.check_bounds::<u64>(max_value),
1314 DataType::Int8 => self.check_bounds::<i8>(max_value),
1315 DataType::Int16 => self.check_bounds::<i16>(max_value),
1316 DataType::Int32 => self.check_bounds::<i32>(max_value),
1317 DataType::Int64 => self.check_bounds::<i64>(max_value),
1318 _ => unreachable!(),
1319 }
1320 }
1321 DataType::RunEndEncoded(run_ends, _values) => {
1322 let run_ends_data = self.child_data()[0].clone();
1323 match run_ends.data_type() {
1324 DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1325 DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1326 DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1327 _ => unreachable!(),
1328 }
1329 }
1330 _ => {
1331 Ok(())
1333 }
1334 }
1335 }
1336
1337 fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1348 where
1349 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1350 V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1351 {
1352 self.typed_offsets::<T>()?
1353 .iter()
1354 .enumerate()
1355 .map(|(i, x)| {
1356 let r = x.to_usize().ok_or_else(|| {
1358 ArrowError::InvalidArgumentError(format!(
1359 "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1360 );
1361 match r {
1363 Ok(n) if n <= offset_limit => Ok((i, n)),
1364 Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1365 "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1366 ),
1367 Err(e) => Err(e),
1368 }
1369 })
1370 .scan(0_usize, |start, end| {
1371 match end {
1373 Ok((i, end)) if *start <= end => {
1374 let range = Some(Ok((i, *start..end)));
1375 *start = end;
1376 range
1377 }
1378 Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1379 "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1380 i - 1, start, end))
1381 )),
1382 Err(err) => Some(Err(err)),
1383 }
1384 })
1385 .skip(1) .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1387 let (item_index, range) = res?;
1388 validate(item_index-1, range)
1389 })
1390 }
1391
1392 fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1395 where
1396 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1397 {
1398 let values_buffer = &self.buffers[1].as_slice();
1399 if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1400 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1402 if !values_str.is_char_boundary(range.start)
1403 || !values_str.is_char_boundary(range.end)
1404 {
1405 return Err(ArrowError::InvalidArgumentError(format!(
1406 "incomplete utf-8 byte sequence from index {string_index}"
1407 )));
1408 }
1409 Ok(())
1410 })
1411 } else {
1412 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1414 std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1415 ArrowError::InvalidArgumentError(format!(
1416 "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1417 ))
1418 })?;
1419 Ok(())
1420 })
1421 }
1422 }
1423
1424 fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1427 where
1428 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1429 {
1430 self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1431 Ok(())
1434 })
1435 }
1436
1437 fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1440 where
1441 T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1442 {
1443 let required_len = self.len + self.offset;
1444 let buffer = &self.buffers[0];
1445
1446 assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1449
1450 let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1452
1453 indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1454 if self.is_null(i) {
1456 return Ok(());
1457 }
1458 let dict_index: i64 = dict_index.try_into().map_err(|_| {
1459 ArrowError::InvalidArgumentError(format!(
1460 "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1461 ))
1462 })?;
1463
1464 if dict_index < 0 || dict_index > max_value {
1465 return Err(ArrowError::InvalidArgumentError(format!(
1466 "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1467 )));
1468 }
1469 Ok(())
1470 })
1471 }
1472
1473 fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1475 where
1476 T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1477 {
1478 let values = self.typed_buffer::<T>(0, self.len)?;
1479 let mut prev_value: i64 = 0_i64;
1480 values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1481 let value: i64 = inp_value.try_into().map_err(|_| {
1482 ArrowError::InvalidArgumentError(format!(
1483 "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1484 ))
1485 })?;
1486 if value <= 0_i64 {
1487 return Err(ArrowError::InvalidArgumentError(format!(
1488 "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1489 )));
1490 }
1491 if ix > 0 && value <= prev_value {
1492 return Err(ArrowError::InvalidArgumentError(format!(
1493 "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1494 )));
1495 }
1496
1497 prev_value = value;
1498 Ok(())
1499 })?;
1500
1501 if prev_value.as_usize() < (self.offset + self.len) {
1502 return Err(ArrowError::InvalidArgumentError(format!(
1503 "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1504 self.offset + self.len
1505 )));
1506 }
1507 Ok(())
1508 }
1509
1510 pub fn ptr_eq(&self, other: &Self) -> bool {
1514 if self.offset != other.offset
1515 || self.len != other.len
1516 || self.data_type != other.data_type
1517 || self.buffers.len() != other.buffers.len()
1518 || self.child_data.len() != other.child_data.len()
1519 {
1520 return false;
1521 }
1522
1523 match (&self.nulls, &other.nulls) {
1524 (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1525 (Some(_), None) | (None, Some(_)) => return false,
1526 _ => {}
1527 };
1528
1529 if !self
1530 .buffers
1531 .iter()
1532 .zip(other.buffers.iter())
1533 .all(|(a, b)| a.as_ptr() == b.as_ptr())
1534 {
1535 return false;
1536 }
1537
1538 self.child_data
1539 .iter()
1540 .zip(other.child_data.iter())
1541 .all(|(a, b)| a.ptr_eq(b))
1542 }
1543
1544 pub fn into_builder(self) -> ArrayDataBuilder {
1546 self.into()
1547 }
1548}
1549
1550pub fn layout(data_type: &DataType) -> DataTypeLayout {
1553 use arrow_schema::IntervalUnit::*;
1556
1557 match data_type {
1558 DataType::Null => DataTypeLayout {
1559 buffers: vec![],
1560 can_contain_null_mask: false,
1561 variadic: false,
1562 },
1563 DataType::Boolean => DataTypeLayout {
1564 buffers: vec![BufferSpec::BitMap],
1565 can_contain_null_mask: true,
1566 variadic: false,
1567 },
1568 DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1569 DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1570 DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1571 DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1572 DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1573 DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1574 DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1575 DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1576 DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1577 DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1578 DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1579 DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1580 DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1581 DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1582 DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1583 DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1584 DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1585 DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1586 DataType::Interval(MonthDayNano) => {
1587 DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1588 }
1589 DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1590 DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1591 DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1592 DataType::FixedSizeBinary(size) => {
1593 let spec = BufferSpec::FixedWidth {
1594 byte_width: (*size).try_into().unwrap(),
1595 alignment: mem::align_of::<u8>(),
1596 };
1597 DataTypeLayout {
1598 buffers: vec![spec],
1599 can_contain_null_mask: true,
1600 variadic: false,
1601 }
1602 }
1603 DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1604 DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1605 DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1606 DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1607 DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1608 DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1610 DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1611 DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1612 DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1613 DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1614 DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), DataType::Union(_, mode) => {
1617 let type_ids = BufferSpec::FixedWidth {
1618 byte_width: mem::size_of::<i8>(),
1619 alignment: mem::align_of::<i8>(),
1620 };
1621
1622 DataTypeLayout {
1623 buffers: match mode {
1624 UnionMode::Sparse => {
1625 vec![type_ids]
1626 }
1627 UnionMode::Dense => {
1628 vec![
1629 type_ids,
1630 BufferSpec::FixedWidth {
1631 byte_width: mem::size_of::<i32>(),
1632 alignment: mem::align_of::<i32>(),
1633 },
1634 ]
1635 }
1636 },
1637 can_contain_null_mask: false,
1638 variadic: false,
1639 }
1640 }
1641 DataType::Dictionary(key_type, _value_type) => layout(key_type),
1642 }
1643}
1644
1645#[derive(Debug, PartialEq, Eq)]
1647pub struct DataTypeLayout {
1649 pub buffers: Vec<BufferSpec>,
1651
1652 pub can_contain_null_mask: bool,
1654
1655 pub variadic: bool,
1659}
1660
1661impl DataTypeLayout {
1662 pub fn new_fixed_width<T>() -> Self {
1664 Self {
1665 buffers: vec![BufferSpec::FixedWidth {
1666 byte_width: mem::size_of::<T>(),
1667 alignment: mem::align_of::<T>(),
1668 }],
1669 can_contain_null_mask: true,
1670 variadic: false,
1671 }
1672 }
1673
1674 pub fn new_nullable_empty() -> Self {
1677 Self {
1678 buffers: vec![],
1679 can_contain_null_mask: true,
1680 variadic: false,
1681 }
1682 }
1683
1684 pub fn new_empty() -> Self {
1687 Self {
1688 buffers: vec![],
1689 can_contain_null_mask: false,
1690 variadic: false,
1691 }
1692 }
1693
1694 pub fn new_binary<T>() -> Self {
1698 Self {
1699 buffers: vec![
1700 BufferSpec::FixedWidth {
1702 byte_width: mem::size_of::<T>(),
1703 alignment: mem::align_of::<T>(),
1704 },
1705 BufferSpec::VariableWidth,
1707 ],
1708 can_contain_null_mask: true,
1709 variadic: false,
1710 }
1711 }
1712
1713 pub fn new_view() -> Self {
1715 Self {
1716 buffers: vec![BufferSpec::FixedWidth {
1717 byte_width: mem::size_of::<u128>(),
1718 alignment: mem::align_of::<u128>(),
1719 }],
1720 can_contain_null_mask: true,
1721 variadic: true,
1722 }
1723 }
1724
1725 pub fn new_list_view<T>() -> Self {
1727 Self {
1728 buffers: vec![
1729 BufferSpec::FixedWidth {
1730 byte_width: mem::size_of::<T>(),
1731 alignment: mem::align_of::<T>(),
1732 },
1733 BufferSpec::FixedWidth {
1734 byte_width: mem::size_of::<T>(),
1735 alignment: mem::align_of::<T>(),
1736 },
1737 ],
1738 can_contain_null_mask: true,
1739 variadic: true,
1740 }
1741 }
1742}
1743
/// Specification of one buffer within a [`DataTypeLayout`].
///
/// The type is plain data, so it is `Clone` and `Copy`: a spec can be reused
/// without rebuilding it field by field.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BufferSpec {
    /// A buffer of fixed-width entries.
    FixedWidth {
        /// Width of one entry, in bytes.
        byte_width: usize,
        /// Alignment required of the buffer, in bytes.
        alignment: usize,
    },
    /// A buffer whose entries have no fixed width.
    VariableWidth,
    /// A bit-packed buffer.
    BitMap,
    /// NOTE(review): nothing visible here constructs this variant, hence the
    /// `dead_code` allowance; confirm its intended meaning before relying on it.
    #[allow(dead_code)]
    AlwaysNull,
}
1776
1777impl PartialEq for ArrayData {
1778 fn eq(&self, other: &Self) -> bool {
1779 equal::equal(self, other)
1780 }
1781}
1782
/// A boolean flag that starts out `false` and can only be changed through the
/// `unsafe` method [`UnsafeFlag::set`].
///
/// The wrapped value is private, so safe code can read the flag with
/// [`UnsafeFlag::get`] but cannot alter it.
#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a flag holding `false`.
    #[inline]
    pub const fn new() -> Self {
        UnsafeFlag(false)
    }

    /// Overwrites the flag with `val`.
    ///
    /// # Safety
    ///
    /// The method itself only stores `val`. It is `unsafe` so that changing
    /// the flag requires an `unsafe` block; the caller takes on whatever
    /// obligations the code consulting the flag attaches to the new value.
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        *self = UnsafeFlag(val);
    }

    /// Returns the current value of the flag.
    #[inline]
    pub fn get(&self) -> bool {
        let UnsafeFlag(value) = self;
        *value
    }
}

impl Default for UnsafeFlag {
    /// Same as [`UnsafeFlag::new`]: the flag starts out `false`.
    fn default() -> Self {
        UnsafeFlag(false)
    }
}
1841
1842#[derive(Debug)]
1844pub struct ArrayDataBuilder {
1845 data_type: DataType,
1846 len: usize,
1847 null_count: Option<usize>,
1848 null_bit_buffer: Option<Buffer>,
1849 nulls: Option<NullBuffer>,
1850 offset: usize,
1851 buffers: Vec<Buffer>,
1852 child_data: Vec<ArrayData>,
1853 align_buffers: bool,
1857 skip_validation: UnsafeFlag,
1867}
1868
1869impl ArrayDataBuilder {
1870 #[inline]
1871 pub const fn new(data_type: DataType) -> Self {
1873 Self {
1874 data_type,
1875 len: 0,
1876 null_count: None,
1877 null_bit_buffer: None,
1878 nulls: None,
1879 offset: 0,
1880 buffers: vec![],
1881 child_data: vec![],
1882 align_buffers: false,
1883 skip_validation: UnsafeFlag::new(),
1884 }
1885 }
1886
1887 pub fn data_type(self, data_type: DataType) -> Self {
1889 Self { data_type, ..self }
1890 }
1891
1892 #[inline]
1893 #[allow(clippy::len_without_is_empty)]
1894 pub const fn len(mut self, n: usize) -> Self {
1896 self.len = n;
1897 self
1898 }
1899
1900 pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1902 self.nulls = nulls;
1903 self.null_count = None;
1904 self.null_bit_buffer = None;
1905 self
1906 }
1907
1908 pub fn null_count(mut self, null_count: usize) -> Self {
1910 self.null_count = Some(null_count);
1911 self
1912 }
1913
1914 pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1916 self.nulls = None;
1917 self.null_bit_buffer = buf;
1918 self
1919 }
1920
1921 #[inline]
1923 pub const fn offset(mut self, n: usize) -> Self {
1924 self.offset = n;
1925 self
1926 }
1927
1928 pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1930 self.buffers = v;
1931 self
1932 }
1933
1934 pub fn add_buffer(mut self, b: Buffer) -> Self {
1936 self.buffers.push(b);
1937 self
1938 }
1939
1940 pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1942 self.buffers.extend(bs);
1943 self
1944 }
1945
1946 pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1948 self.child_data = v;
1949 self
1950 }
1951
1952 pub fn add_child_data(mut self, r: ArrayData) -> Self {
1954 self.child_data.push(r);
1955 self
1956 }
1957
1958 pub unsafe fn build_unchecked(self) -> ArrayData {
1973 self.skip_validation(true).build().unwrap()
1974 }
1975
1976 pub fn build(self) -> Result<ArrayData, ArrowError> {
1985 let Self {
1986 data_type,
1987 len,
1988 null_count,
1989 null_bit_buffer,
1990 nulls,
1991 offset,
1992 buffers,
1993 child_data,
1994 align_buffers,
1995 skip_validation,
1996 } = self;
1997
1998 let nulls = nulls
1999 .or_else(|| {
2000 let buffer = null_bit_buffer?;
2001 let buffer = BooleanBuffer::new(buffer, offset, len);
2002 Some(match null_count {
2003 Some(n) => {
2004 unsafe { NullBuffer::new_unchecked(buffer, n) }
2006 }
2007 None => NullBuffer::new(buffer),
2008 })
2009 })
2010 .filter(|b| b.null_count() != 0);
2011
2012 let mut data = ArrayData {
2013 data_type,
2014 len,
2015 offset,
2016 buffers,
2017 child_data,
2018 nulls,
2019 };
2020
2021 if align_buffers {
2022 data.align_buffers();
2023 }
2024
2025 if !skip_validation.get() || cfg!(feature = "force_validate") {
2027 data.validate_data()?;
2028 }
2029 Ok(data)
2030 }
2031
2032 #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2034 pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2035 self.align_buffers(true).build()
2036 }
2037
2038 pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2054 self.align_buffers = align_buffers;
2055 self
2056 }
2057
2058 pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2072 self.skip_validation.set(skip_validation);
2073 self
2074 }
2075}
2076
2077impl From<ArrayData> for ArrayDataBuilder {
2078 fn from(d: ArrayData) -> Self {
2079 Self {
2080 data_type: d.data_type,
2081 len: d.len,
2082 offset: d.offset,
2083 buffers: d.buffers,
2084 child_data: d.child_data,
2085 nulls: d.nulls,
2086 null_bit_buffer: None,
2087 null_count: None,
2088 align_buffers: false,
2089 skip_validation: UnsafeFlag::new(),
2090 }
2091 }
2092}
2093
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::{Field, Fields};

    /// Message expected from `validate` for an `Int32` buffer that starts one
    /// byte off its required alignment.
    const MISALIGNED_INT32: &str = "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1";

    /// Buffer holding `n` copies of `42_i32`.
    fn make_i32_buffer(n: usize) -> Buffer {
        let values = vec![42i32; n];
        Buffer::from_slice_ref(values)
    }

    /// Buffer holding `n` copies of `42_f32`.
    fn make_f32_buffer(n: usize) -> Buffer {
        let values = vec![42f32; n];
        Buffer::from_slice_ref(values)
    }

    /// Two-byte bitmap in which only bits 0, 3 and 10 are set.
    fn sparse_validity_bits() -> [u8; 2] {
        let mut bits = [0_u8; 2];
        for position in [0, 3, 10] {
            bit_util::set_bit(&mut bits, position);
        }
        bits
    }

    /// 16-element `Int32` data whose validity bitmap is `bits`.
    fn int32_data_with_validity(bits: [u8; 2]) -> ArrayData {
        ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bits)))
            .build()
            .unwrap()
    }

    /// Three-element `Utf8` data over "abcdef" whose middle element is null.
    fn nullable_utf8_data() -> ArrayData {
        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
        ArrayData::try_new(
            DataType::Utf8,
            3,
            Some(Buffer::from_iter(vec![true, false, true])),
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap()
    }

    /// Zero-length `Int32` data wrapping `buffer`, built without validation.
    fn empty_int32_data(buffer: Buffer) -> ArrayData {
        ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![buffer],
            child_data: vec![],
            nulls: None,
        }
    }

    #[test]
    fn test_builder() {
        let values: Vec<i32> = (0..25).collect();
        let validity = Buffer::from([0b01011111, 0b10110101, 0b01100011, 0b00011110]);

        let arr_data = ArrayData::builder(DataType::Int32)
            .len(20)
            .offset(5)
            .add_buffer(Buffer::from_slice_ref(&values))
            .null_bit_buffer(Some(validity))
            .build()
            .unwrap();

        assert_eq!(arr_data.len(), 20);
        assert_eq!(arr_data.null_count(), 10);
        assert_eq!(arr_data.offset(), 5);
        assert_eq!(arr_data.buffers().len(), 1);
        assert_eq!(
            arr_data.buffers()[0].as_slice(),
            Buffer::from_slice_ref(&values).as_slice()
        );
    }

    #[test]
    fn test_builder_with_child_data() {
        let child_values = Buffer::from_slice_ref([1i32, 2, 3, 4, 5]);
        let child_arr_data =
            ArrayData::try_new(DataType::Int32, 5, None, 0, vec![child_values], vec![]).unwrap();

        let fields = vec![Arc::new(Field::new("x", DataType::Int32, true))];
        let arr_data = ArrayData::builder(DataType::Struct(fields.into()))
            .len(5)
            .offset(0)
            .add_child_data(child_arr_data.clone())
            .build()
            .unwrap();

        assert_eq!(arr_data.len(), 5);
        assert_eq!(arr_data.child_data().len(), 1);
        assert_eq!(arr_data.child_data()[0], child_arr_data);
    }

    #[test]
    fn test_null_count() {
        // 16 slots with 3 valid bits.
        let full = int32_data_with_validity(sparse_validity_bits());
        assert_eq!(full.null_count(), 13);

        // Same bitmap seen through a window of 12 slots starting at slot 2.
        let windowed = ArrayData::builder(DataType::Int32)
            .len(12)
            .offset(2)
            .add_buffer(make_i32_buffer(14))
            .null_bit_buffer(Some(Buffer::from(sparse_validity_bits())))
            .build()
            .unwrap();
        assert_eq!(windowed.null_count(), 10);
    }

    #[test]
    fn test_null_buffer_ref() {
        let bits = sparse_validity_bits();
        let arr_data = int32_data_with_validity(bits);

        assert!(arr_data.nulls().is_some());
        assert_eq!(&bits, arr_data.nulls().unwrap().validity());
    }

    #[test]
    fn test_slice() {
        let data = int32_data_with_validity(sparse_validity_bits());

        let without_first = data.slice(1, 15);
        assert_eq!(without_first.len(), data.len() - 1);
        assert_eq!(without_first.offset(), 1);
        assert_eq!(without_first.null_count(), data.null_count());

        // Slicing a slice: offsets accumulate.
        let without_first_two = without_first.slice(1, 14);
        assert_eq!(without_first_two.len(), data.len() - 2);
        assert_eq!(without_first_two.offset(), 2);
        assert_eq!(without_first_two.null_count(), data.null_count() - 1);
    }

    #[test]
    fn test_equality() {
        let int_data = ArrayData::builder(DataType::Int32)
            .len(1)
            .add_buffer(make_i32_buffer(1))
            .build()
            .unwrap();
        let float_data = ArrayData::builder(DataType::Float32)
            .len(1)
            .add_buffer(make_f32_buffer(1))
            .build()
            .unwrap();

        assert_ne!(int_data, float_data);
        assert!(!int_data.ptr_eq(&float_data));
        assert!(int_data.ptr_eq(&int_data));

        #[allow(clippy::redundant_clone)]
        let int_data_clone = int_data.clone();
        assert_eq!(int_data, int_data_clone);
        assert!(int_data.ptr_eq(&int_data_clone));
        assert!(int_data_clone.ptr_eq(&int_data));

        let int_data_slice = int_data_clone.slice(1, 0);
        assert!(int_data_slice.ptr_eq(&int_data_slice));
        assert!(!int_data.ptr_eq(&int_data_slice));
        assert!(!int_data_slice.ptr_eq(&int_data));

        let string_data = nullable_utf8_data();

        assert_ne!(float_data, string_data);
        assert!(!float_data.ptr_eq(&string_data));

        assert!(string_data.ptr_eq(&string_data));

        #[allow(clippy::redundant_clone)]
        let string_data_cloned = string_data.clone();
        assert!(string_data_cloned.ptr_eq(&string_data));
        assert!(string_data.ptr_eq(&string_data_cloned));

        let string_data_slice = string_data.slice(1, 2);
        assert!(string_data_slice.ptr_eq(&string_data_slice));
        assert!(!string_data_slice.ptr_eq(&string_data))
    }

    #[test]
    fn test_slice_memory_size() {
        let data = int32_data_with_validity(sparse_validity_bits());
        let new_data = data.slice(1, 14);
        assert_eq!(
            data.get_slice_memory_size().unwrap() - 8,
            new_data.get_slice_memory_size().unwrap()
        );

        let string_data = nullable_utf8_data();
        let string_data_slice = string_data.slice(1, 2);
        assert_eq!(
            string_data.get_slice_memory_size().unwrap() - 6,
            string_data_slice.get_slice_memory_size().unwrap()
        );
    }

    #[test]
    fn test_count_nulls() {
        let bitmap = Buffer::from([0b00010110, 0b10011111]);
        let nulls = NullBuffer::new(BooleanBuffer::new(bitmap, 0, 16));

        assert_eq!(count_nulls(Some(&nulls), 0, 16), 7);
        assert_eq!(count_nulls(Some(&nulls), 4, 8), 3);
    }

    #[test]
    fn test_contains_nulls() {
        let bitmap: Buffer =
            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
        let nulls = NullBuffer::new(BooleanBuffer::new(bitmap, 0, 6));

        assert!(contains_nulls(Some(&nulls), 0, 6));
        assert!(contains_nulls(Some(&nulls), 0, 3));
        assert!(!contains_nulls(Some(&nulls), 3, 2));
        assert!(!contains_nulls(Some(&nulls), 0, 0));
    }

    #[test]
    fn test_alignment() {
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        let sliced = buffer.slice(1);

        let mut data = empty_int32_data(buffer);
        data.validate_full().unwrap();

        // Swap in the buffer that starts one byte later.
        data.buffers[0] = sliced;
        let err = data.validate().unwrap_err();
        assert_eq!(err.to_string(), MISALIGNED_INT32);

        data.align_buffers();
        data.validate_full().unwrap();
    }

    #[test]
    fn test_alignment_struct() {
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        let sliced = buffer.slice(1);

        let child_data = empty_int32_data(buffer);
        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
        let mut data = ArrayData {
            data_type: schema,
            len: 0,
            offset: 0,
            buffers: vec![],
            child_data: vec![child_data],
            nulls: None,
        };
        data.validate_full().unwrap();

        // Swap in the buffer that starts one byte later, inside the child.
        data.child_data[0].buffers[0] = sliced;
        let err = data.validate().unwrap_err();
        assert_eq!(err.to_string(), MISALIGNED_INT32);

        data.align_buffers();
        data.validate_full().unwrap();
    }

    #[test]
    fn test_null_view_types() {
        let array_len = 32;

        for data_type in [DataType::BinaryView, DataType::Utf8View] {
            let array = ArrayData::new_null(&data_type, array_len);
            assert_eq!(array.len(), array_len);
            assert!((0..array.len()).all(|i| array.is_null(i)));
        }
    }
}