1use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24 ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35 null_bit_buffer: Option<&NullBuffer>,
36 offset: usize,
37 len: usize,
38) -> bool {
39 match null_bit_buffer {
40 Some(buffer) => {
41 match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42 Some((start, end)) => start != 0 || end != len,
43 None => len != 0, }
45 }
46 None => false, }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52 null_bit_buffer: Option<&NullBuffer>,
53 offset: usize,
54 len: usize,
55) -> usize {
56 if let Some(buf) = null_bit_buffer {
57 let buffer = buf.buffer();
58 len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59 } else {
60 0
61 }
62}
63
64#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67 let empty_buffer = MutableBuffer::new(0);
68 match data_type {
69 DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70 DataType::Boolean => {
71 let bytes = bit_util::ceil(capacity, 8);
72 let buffer = MutableBuffer::new(bytes);
73 [buffer, empty_buffer]
74 }
75 DataType::UInt8
76 | DataType::UInt16
77 | DataType::UInt32
78 | DataType::UInt64
79 | DataType::Int8
80 | DataType::Int16
81 | DataType::Int32
82 | DataType::Int64
83 | DataType::Float16
84 | DataType::Float32
85 | DataType::Float64
86 | DataType::Decimal32(_, _)
87 | DataType::Decimal64(_, _)
88 | DataType::Decimal128(_, _)
89 | DataType::Decimal256(_, _)
90 | DataType::Date32
91 | DataType::Time32(_)
92 | DataType::Date64
93 | DataType::Time64(_)
94 | DataType::Duration(_)
95 | DataType::Timestamp(_, _)
96 | DataType::Interval(_) => [
97 MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98 empty_buffer,
99 ],
100 DataType::Utf8 | DataType::Binary => {
101 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102 buffer.push(0i32);
104 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105 }
106 DataType::LargeUtf8 | DataType::LargeBinary => {
107 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108 buffer.push(0i64);
110 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111 }
112 DataType::BinaryView | DataType::Utf8View => [
113 MutableBuffer::new(capacity * mem::size_of::<u128>()),
114 empty_buffer,
115 ],
116 DataType::List(_) | DataType::Map(_, _) => {
117 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119 buffer.push(0i32);
120 [buffer, empty_buffer]
121 }
122 DataType::ListView(_) => [
123 MutableBuffer::new(capacity * mem::size_of::<i32>()),
124 MutableBuffer::new(capacity * mem::size_of::<i32>()),
125 ],
126 DataType::LargeList(_) => {
127 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129 buffer.push(0i64);
130 [buffer, empty_buffer]
131 }
132 DataType::LargeListView(_) => [
133 MutableBuffer::new(capacity * mem::size_of::<i64>()),
134 MutableBuffer::new(capacity * mem::size_of::<i64>()),
135 ],
136 DataType::FixedSizeBinary(size) => {
137 [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138 }
139 DataType::Dictionary(k, _) => [
140 MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141 empty_buffer,
142 ],
143 DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144 [empty_buffer, MutableBuffer::new(0)]
145 }
146 DataType::Union(_, mode) => {
147 let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148 match mode {
149 UnionMode::Sparse => [type_ids, empty_buffer],
150 UnionMode::Dense => {
151 let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152 [type_ids, offsets]
153 }
154 }
155 }
156 }
157}
158
159#[derive(Debug, Clone)]
205pub struct ArrayData {
206 data_type: DataType,
208
209 len: usize,
211
212 offset: usize,
217
218 buffers: Vec<Buffer>,
231
232 child_data: Vec<ArrayData>,
242
243 nulls: Option<NullBuffer>,
251}
252
253pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257 pub unsafe fn new_unchecked(
274 data_type: DataType,
275 len: usize,
276 null_count: Option<usize>,
277 null_bit_buffer: Option<Buffer>,
278 offset: usize,
279 buffers: Vec<Buffer>,
280 child_data: Vec<ArrayData>,
281 ) -> Self {
282 let mut skip_validation = UnsafeFlag::new();
283 unsafe { skip_validation.set(true) };
285
286 ArrayDataBuilder {
287 data_type,
288 len,
289 null_count,
290 null_bit_buffer,
291 nulls: None,
292 offset,
293 buffers,
294 child_data,
295 align_buffers: false,
296 skip_validation,
297 }
298 .build()
299 .unwrap()
300 }
301
302 pub fn try_new(
313 data_type: DataType,
314 len: usize,
315 null_bit_buffer: Option<Buffer>,
316 offset: usize,
317 buffers: Vec<Buffer>,
318 child_data: Vec<ArrayData>,
319 ) -> Result<Self, ArrowError> {
320 if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
324 let needed_len = bit_util::ceil(len + offset, 8);
325 if null_bit_buffer.len() < needed_len {
326 return Err(ArrowError::InvalidArgumentError(format!(
327 "null_bit_buffer size too small. got {} needed {}",
328 null_bit_buffer.len(),
329 needed_len
330 )));
331 }
332 }
333 let new_self = unsafe {
335 Self::new_unchecked(
336 data_type,
337 len,
338 None,
339 null_bit_buffer,
340 offset,
341 buffers,
342 child_data,
343 )
344 };
345
346 new_self.validate_data()?;
351 Ok(new_self)
352 }
353
354 #[inline]
356 pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
357 ArrayDataBuilder::new(data_type)
358 }
359
360 #[inline]
362 pub const fn data_type(&self) -> &DataType {
363 &self.data_type
364 }
365
366 pub fn buffers(&self) -> &[Buffer] {
368 &self.buffers
369 }
370
371 pub fn child_data(&self) -> &[ArrayData] {
374 &self.child_data[..]
375 }
376
377 #[inline]
379 pub fn is_null(&self, i: usize) -> bool {
380 match &self.nulls {
381 Some(v) => v.is_null(i),
382 None => false,
383 }
384 }
385
386 #[inline]
390 pub fn nulls(&self) -> Option<&NullBuffer> {
391 self.nulls.as_ref()
392 }
393
394 #[inline]
396 pub fn is_valid(&self, i: usize) -> bool {
397 !self.is_null(i)
398 }
399
400 #[inline]
402 pub const fn len(&self) -> usize {
403 self.len
404 }
405
406 #[inline]
408 pub const fn is_empty(&self) -> bool {
409 self.len == 0
410 }
411
412 #[inline]
414 pub const fn offset(&self) -> usize {
415 self.offset
416 }
417
418 #[inline]
420 pub fn null_count(&self) -> usize {
421 self.nulls
422 .as_ref()
423 .map(|x| x.null_count())
424 .unwrap_or_default()
425 }
426
427 pub fn get_buffer_memory_size(&self) -> usize {
439 let mut size = 0;
440 for buffer in &self.buffers {
441 size += buffer.capacity();
442 }
443 if let Some(bitmap) = &self.nulls {
444 size += bitmap.buffer().capacity()
445 }
446 for child in &self.child_data {
447 size += child.get_buffer_memory_size();
448 }
449 size
450 }
451
452 pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
465 let mut result: usize = 0;
466 let layout = layout(&self.data_type);
467
468 for spec in layout.buffers.iter() {
469 match spec {
470 BufferSpec::FixedWidth { byte_width, .. } => {
471 let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
472 ArrowError::ComputeError(
473 "Integer overflow computing buffer size".to_string(),
474 )
475 })?;
476 result += buffer_size;
477 }
478 BufferSpec::VariableWidth => {
479 let buffer_len = match self.data_type {
480 DataType::Utf8 | DataType::Binary => {
481 let offsets = self.typed_offsets::<i32>()?;
482 (offsets[self.len] - offsets[0]) as usize
483 }
484 DataType::LargeUtf8 | DataType::LargeBinary => {
485 let offsets = self.typed_offsets::<i64>()?;
486 (offsets[self.len] - offsets[0]) as usize
487 }
488 _ => {
489 return Err(ArrowError::NotYetImplemented(format!(
490 "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
491 self.data_type
492 )));
493 }
494 };
495 result += buffer_len;
496 }
497 BufferSpec::BitMap => {
498 let buffer_size = bit_util::ceil(self.len, 8);
499 result += buffer_size;
500 }
501 BufferSpec::AlwaysNull => {
502 }
504 }
505 }
506
507 if self.nulls().is_some() {
508 result += bit_util::ceil(self.len, 8);
509 }
510
511 for child in &self.child_data {
512 result += child.get_slice_memory_size()?;
513 }
514 Ok(result)
515 }
516
517 pub fn get_array_memory_size(&self) -> usize {
526 let mut size = mem::size_of_val(self);
527
528 for buffer in &self.buffers {
530 size += mem::size_of::<Buffer>();
531 size += buffer.capacity();
532 }
533 if let Some(nulls) = &self.nulls {
534 size += nulls.buffer().capacity();
535 }
536 for child in &self.child_data {
537 size += child.get_array_memory_size();
538 }
539
540 size
541 }
542
543 pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
551 assert!((offset + length) <= self.len());
552
553 if let DataType::Struct(_) = self.data_type() {
554 let new_offset = self.offset + offset;
556 ArrayData {
557 data_type: self.data_type().clone(),
558 len: length,
559 offset: new_offset,
560 buffers: self.buffers.clone(),
561 child_data: self
563 .child_data()
564 .iter()
565 .map(|data| data.slice(offset, length))
566 .collect(),
567 nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
568 }
569 } else {
570 let mut new_data = self.clone();
571
572 new_data.len = length;
573 new_data.offset = offset + self.offset;
574 new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
575
576 new_data
577 }
578 }
579
580 pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
587 &self.buffers()[buffer].typed_data()[self.offset..]
588 }
589
590 pub fn new_null(data_type: &DataType, len: usize) -> Self {
592 let bit_len = bit_util::ceil(len, 8);
593 let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
594
595 let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
596 Some(width) => (vec![zeroed(width * len)], vec![], true),
597 None => match data_type {
598 DataType::Null => (vec![], vec![], false),
599 DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
600 DataType::Binary | DataType::Utf8 => {
601 (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
602 }
603 DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
604 DataType::LargeBinary | DataType::LargeUtf8 => {
605 (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
606 }
607 DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
608 DataType::List(f) | DataType::Map(f, _) => (
609 vec![zeroed((len + 1) * 4)],
610 vec![ArrayData::new_empty(f.data_type())],
611 true,
612 ),
613 DataType::LargeList(f) => (
614 vec![zeroed((len + 1) * 8)],
615 vec![ArrayData::new_empty(f.data_type())],
616 true,
617 ),
618 DataType::FixedSizeList(f, list_len) => (
619 vec![],
620 vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
621 true,
622 ),
623 DataType::Struct(fields) => (
624 vec![],
625 fields
626 .iter()
627 .map(|f| Self::new_null(f.data_type(), len))
628 .collect(),
629 true,
630 ),
631 DataType::Dictionary(k, v) => (
632 vec![zeroed(k.primitive_width().unwrap() * len)],
633 vec![ArrayData::new_empty(v.as_ref())],
634 true,
635 ),
636 DataType::Union(f, mode) => {
637 let (id, _) = f.iter().next().unwrap();
638 let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
639 let buffers = match mode {
640 UnionMode::Sparse => vec![ids],
641 UnionMode::Dense => {
642 let end_offset = i32::from_usize(len).unwrap();
643 vec![ids, Buffer::from_iter(0_i32..end_offset)]
644 }
645 };
646
647 let children = f
648 .iter()
649 .enumerate()
650 .map(|(idx, (_, f))| {
651 if idx == 0 || *mode == UnionMode::Sparse {
652 Self::new_null(f.data_type(), len)
653 } else {
654 Self::new_empty(f.data_type())
655 }
656 })
657 .collect();
658
659 (buffers, children, false)
660 }
661 DataType::RunEndEncoded(r, v) => {
662 let runs = match r.data_type() {
663 DataType::Int16 => {
664 let i = i16::from_usize(len).expect("run overflow");
665 Buffer::from_slice_ref([i])
666 }
667 DataType::Int32 => {
668 let i = i32::from_usize(len).expect("run overflow");
669 Buffer::from_slice_ref([i])
670 }
671 DataType::Int64 => {
672 let i = i64::from_usize(len).expect("run overflow");
673 Buffer::from_slice_ref([i])
674 }
675 dt => unreachable!("Invalid run ends data type {dt}"),
676 };
677
678 let builder = ArrayData::builder(r.data_type().clone())
679 .len(1)
680 .buffers(vec![runs]);
681
682 let runs = unsafe { builder.build_unchecked() };
685 (
686 vec![],
687 vec![runs, ArrayData::new_null(v.data_type(), 1)],
688 false,
689 )
690 }
691 d => unreachable!("{d}"),
692 },
693 };
694
695 let mut builder = ArrayDataBuilder::new(data_type.clone())
696 .len(len)
697 .buffers(buffers)
698 .child_data(child_data);
699
700 if has_nulls {
701 builder = builder.nulls(Some(NullBuffer::new_null(len)))
702 }
703
704 unsafe { builder.build_unchecked() }
707 }
708
709 pub fn new_empty(data_type: &DataType) -> Self {
711 Self::new_null(data_type, 0)
712 }
713
714 pub fn align_buffers(&mut self) {
723 let layout = layout(&self.data_type);
724 for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
725 if let BufferSpec::FixedWidth { alignment, .. } = spec {
726 if buffer.as_ptr().align_offset(*alignment) != 0 {
727 *buffer = Buffer::from_slice_ref(buffer.as_ref());
728 }
729 }
730 }
731 for data in self.child_data.iter_mut() {
733 data.align_buffers()
734 }
735 }
736
737 pub fn validate(&self) -> Result<(), ArrowError> {
748 let len_plus_offset = self.len + self.offset;
750
751 let layout = layout(&self.data_type);
753
754 if !layout.can_contain_null_mask && self.nulls.is_some() {
755 return Err(ArrowError::InvalidArgumentError(format!(
756 "Arrays of type {:?} cannot contain a null bitmask",
757 self.data_type,
758 )));
759 }
760
761 if self.buffers.len() < layout.buffers.len()
763 || (!layout.variadic && self.buffers.len() != layout.buffers.len())
764 {
765 return Err(ArrowError::InvalidArgumentError(format!(
766 "Expected {} buffers in array of type {:?}, got {}",
767 layout.buffers.len(),
768 self.data_type,
769 self.buffers.len(),
770 )));
771 }
772
773 for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
774 match spec {
775 BufferSpec::FixedWidth {
776 byte_width,
777 alignment,
778 } => {
779 let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
780
781 if buffer.len() < min_buffer_size {
782 return Err(ArrowError::InvalidArgumentError(format!(
783 "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
784 min_buffer_size,
785 i,
786 self.data_type,
787 buffer.len()
788 )));
789 }
790
791 let align_offset = buffer.as_ptr().align_offset(*alignment);
792 if align_offset != 0 {
793 return Err(ArrowError::InvalidArgumentError(format!(
794 "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
795 self.data_type,
796 align_offset.min(alignment - align_offset)
797 )));
798 }
799 }
800 BufferSpec::VariableWidth => {
801 }
805 BufferSpec::BitMap => {
806 let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
807 if buffer.len() < min_buffer_size {
808 return Err(ArrowError::InvalidArgumentError(format!(
809 "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
810 min_buffer_size,
811 i,
812 self.data_type,
813 buffer.len()
814 )));
815 }
816 }
817 BufferSpec::AlwaysNull => {
818 }
820 }
821 }
822
823 if let Some(nulls) = self.nulls() {
825 if nulls.null_count() > self.len {
826 return Err(ArrowError::InvalidArgumentError(format!(
827 "null_count {} for an array exceeds length of {} elements",
828 nulls.null_count(),
829 self.len
830 )));
831 }
832
833 let actual_len = nulls.validity().len();
834 let needed_len = bit_util::ceil(len_plus_offset, 8);
835 if actual_len < needed_len {
836 return Err(ArrowError::InvalidArgumentError(format!(
837 "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
838 )));
839 }
840
841 if nulls.len() != self.len {
842 return Err(ArrowError::InvalidArgumentError(format!(
843 "null buffer incorrect size. got {} expected {}",
844 nulls.len(),
845 self.len
846 )));
847 }
848 }
849
850 self.validate_child_data()?;
851
852 match &self.data_type {
854 DataType::Utf8 | DataType::Binary => {
855 self.validate_offsets::<i32>(self.buffers[1].len())?;
856 }
857 DataType::LargeUtf8 | DataType::LargeBinary => {
858 self.validate_offsets::<i64>(self.buffers[1].len())?;
859 }
860 DataType::Dictionary(key_type, _value_type) => {
861 if !DataType::is_dictionary_key_type(key_type) {
863 return Err(ArrowError::InvalidArgumentError(format!(
864 "Dictionary key type must be integer, but was {key_type}"
865 )));
866 }
867 }
868 DataType::RunEndEncoded(run_ends_type, _) => {
869 if run_ends_type.is_nullable() {
870 return Err(ArrowError::InvalidArgumentError(
871 "The nullable should be set to false for the field defining run_ends array.".to_string()
872 ));
873 }
874 if !DataType::is_run_ends_type(run_ends_type.data_type()) {
875 return Err(ArrowError::InvalidArgumentError(format!(
876 "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
877 run_ends_type.data_type()
878 )));
879 }
880 }
881 _ => {}
882 };
883
884 Ok(())
885 }
886
887 fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
894 if self.len == 0 && self.buffers[0].is_empty() {
896 return Ok(&[]);
897 }
898
899 self.typed_buffer(0, self.len + 1)
900 }
901
902 fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
904 &self,
905 idx: usize,
906 len: usize,
907 ) -> Result<&[T], ArrowError> {
908 let buffer = &self.buffers[idx];
909
910 let required_len = (len + self.offset) * mem::size_of::<T>();
911
912 if buffer.len() < required_len {
913 return Err(ArrowError::InvalidArgumentError(format!(
914 "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
915 idx,
916 self.data_type,
917 required_len,
918 buffer.len()
919 )));
920 }
921
922 Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
923 }
924
925 fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
928 &self,
929 values_length: usize,
930 ) -> Result<(), ArrowError> {
931 let offsets = self.typed_offsets::<T>()?;
933 if offsets.is_empty() {
934 return Ok(());
935 }
936
937 let first_offset = offsets[0].to_usize().ok_or_else(|| {
938 ArrowError::InvalidArgumentError(format!(
939 "Error converting offset[0] ({}) to usize for {}",
940 offsets[0], self.data_type
941 ))
942 })?;
943
944 let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
945 ArrowError::InvalidArgumentError(format!(
946 "Error converting offset[{}] ({}) to usize for {}",
947 self.len, offsets[self.len], self.data_type
948 ))
949 })?;
950
951 if first_offset > values_length {
952 return Err(ArrowError::InvalidArgumentError(format!(
953 "First offset {} of {} is larger than values length {}",
954 first_offset, self.data_type, values_length,
955 )));
956 }
957
958 if last_offset > values_length {
959 return Err(ArrowError::InvalidArgumentError(format!(
960 "Last offset {} of {} is larger than values length {}",
961 last_offset, self.data_type, values_length,
962 )));
963 }
964
965 if first_offset > last_offset {
966 return Err(ArrowError::InvalidArgumentError(format!(
967 "First offset {} in {} is smaller than last offset {}",
968 first_offset, self.data_type, last_offset,
969 )));
970 }
971
972 Ok(())
973 }
974
975 fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
978 &self,
979 values_length: usize,
980 ) -> Result<(), ArrowError> {
981 let offsets: &[T] = self.typed_buffer(0, self.len)?;
982 let sizes: &[T] = self.typed_buffer(1, self.len)?;
983 if offsets.len() != sizes.len() {
984 return Err(ArrowError::ComputeError(format!(
985 "ListView offsets len {} does not match sizes len {}",
986 offsets.len(),
987 sizes.len()
988 )));
989 }
990
991 for i in 0..sizes.len() {
992 let size = sizes[i].to_usize().ok_or_else(|| {
993 ArrowError::InvalidArgumentError(format!(
994 "Error converting size[{}] ({}) to usize for {}",
995 i, sizes[i], self.data_type
996 ))
997 })?;
998 let offset = offsets[i].to_usize().ok_or_else(|| {
999 ArrowError::InvalidArgumentError(format!(
1000 "Error converting offset[{}] ({}) to usize for {}",
1001 i, offsets[i], self.data_type
1002 ))
1003 })?;
1004 if size
1005 .checked_add(offset)
1006 .expect("Offset and size have exceeded the usize boundary")
1007 > values_length
1008 {
1009 return Err(ArrowError::InvalidArgumentError(format!(
1010 "Size {} at index {} is larger than the remaining values for {}",
1011 size, i, self.data_type
1012 )));
1013 }
1014 }
1015 Ok(())
1016 }
1017
1018 fn validate_child_data(&self) -> Result<(), ArrowError> {
1020 match &self.data_type {
1021 DataType::List(field) | DataType::Map(field, _) => {
1022 let values_data = self.get_single_valid_child_data(field.data_type())?;
1023 self.validate_offsets::<i32>(values_data.len)?;
1024 Ok(())
1025 }
1026 DataType::LargeList(field) => {
1027 let values_data = self.get_single_valid_child_data(field.data_type())?;
1028 self.validate_offsets::<i64>(values_data.len)?;
1029 Ok(())
1030 }
1031 DataType::ListView(field) => {
1032 let values_data = self.get_single_valid_child_data(field.data_type())?;
1033 self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1034 Ok(())
1035 }
1036 DataType::LargeListView(field) => {
1037 let values_data = self.get_single_valid_child_data(field.data_type())?;
1038 self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1039 Ok(())
1040 }
1041 DataType::FixedSizeList(field, list_size) => {
1042 let values_data = self.get_single_valid_child_data(field.data_type())?;
1043
1044 let list_size: usize = (*list_size).try_into().map_err(|_| {
1045 ArrowError::InvalidArgumentError(format!(
1046 "{} has a negative list_size {}",
1047 self.data_type, list_size
1048 ))
1049 })?;
1050
1051 let expected_values_len = self.len
1052 .checked_mul(list_size)
1053 .expect("integer overflow computing expected number of expected values in FixedListSize");
1054
1055 if values_data.len < expected_values_len {
1056 return Err(ArrowError::InvalidArgumentError(format!(
1057 "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1058 values_data.len, self.len, list_size, self.data_type
1059 )));
1060 }
1061
1062 Ok(())
1063 }
1064 DataType::Struct(fields) => {
1065 self.validate_num_child_data(fields.len())?;
1066 for (i, field) in fields.iter().enumerate() {
1067 let field_data = self.get_valid_child_data(i, field.data_type())?;
1068
1069 if field_data.len < self.len {
1071 return Err(ArrowError::InvalidArgumentError(format!(
1072 "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1073 self.data_type,
1074 i,
1075 field.name(),
1076 field_data.len,
1077 self.len
1078 )));
1079 }
1080 }
1081 Ok(())
1082 }
1083 DataType::RunEndEncoded(run_ends_field, values_field) => {
1084 self.validate_num_child_data(2)?;
1085 let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1086 let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1087 if run_ends_data.len != values_data.len {
1088 return Err(ArrowError::InvalidArgumentError(format!(
1089 "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1090 run_ends_data.len, values_data.len
1091 )));
1092 }
1093 if run_ends_data.nulls.is_some() {
1094 return Err(ArrowError::InvalidArgumentError(
1095 "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1096 ));
1097 }
1098 Ok(())
1099 }
1100 DataType::Union(fields, mode) => {
1101 self.validate_num_child_data(fields.len())?;
1102
1103 for (i, (_, field)) in fields.iter().enumerate() {
1104 let field_data = self.get_valid_child_data(i, field.data_type())?;
1105
1106 if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1107 return Err(ArrowError::InvalidArgumentError(format!(
1108 "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1109 i,
1110 field_data.len,
1111 self.len + self.offset
1112 )));
1113 }
1114 }
1115 Ok(())
1116 }
1117 DataType::Dictionary(_key_type, value_type) => {
1118 self.get_single_valid_child_data(value_type)?;
1119 Ok(())
1120 }
1121 _ => {
1122 if !self.child_data.is_empty() {
1124 return Err(ArrowError::InvalidArgumentError(format!(
1125 "Expected no child arrays for type {} but got {}",
1126 self.data_type,
1127 self.child_data.len()
1128 )));
1129 }
1130 Ok(())
1131 }
1132 }
1133 }
1134
1135 fn get_single_valid_child_data(
1139 &self,
1140 expected_type: &DataType,
1141 ) -> Result<&ArrayData, ArrowError> {
1142 self.validate_num_child_data(1)?;
1143 self.get_valid_child_data(0, expected_type)
1144 }
1145
1146 fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1148 if self.child_data.len() != expected_len {
1149 Err(ArrowError::InvalidArgumentError(format!(
1150 "Value data for {} should contain {} child data array(s), had {}",
1151 self.data_type,
1152 expected_len,
1153 self.child_data.len()
1154 )))
1155 } else {
1156 Ok(())
1157 }
1158 }
1159
1160 fn get_valid_child_data(
1163 &self,
1164 i: usize,
1165 expected_type: &DataType,
1166 ) -> Result<&ArrayData, ArrowError> {
1167 let values_data = self.child_data.get(i).ok_or_else(|| {
1168 ArrowError::InvalidArgumentError(format!(
1169 "{} did not have enough child arrays. Expected at least {} but had only {}",
1170 self.data_type,
1171 i + 1,
1172 self.child_data.len()
1173 ))
1174 })?;
1175
1176 if expected_type != &values_data.data_type {
1177 return Err(ArrowError::InvalidArgumentError(format!(
1178 "Child type mismatch for {}. Expected {} but child data had {}",
1179 self.data_type, expected_type, values_data.data_type
1180 )));
1181 }
1182
1183 values_data.validate()?;
1184 Ok(values_data)
1185 }
1186
1187 pub fn validate_data(&self) -> Result<(), ArrowError> {
1203 self.validate()?;
1204
1205 self.validate_nulls()?;
1206 self.validate_values()?;
1207 Ok(())
1208 }
1209
1210 pub fn validate_full(&self) -> Result<(), ArrowError> {
1215 self.validate_data()?;
1216 self.child_data
1218 .iter()
1219 .enumerate()
1220 .try_for_each(|(i, child_data)| {
1221 child_data.validate_full().map_err(|e| {
1222 ArrowError::InvalidArgumentError(format!(
1223 "{} child #{} invalid: {}",
1224 self.data_type, i, e
1225 ))
1226 })
1227 })?;
1228 Ok(())
1229 }
1230
1231 pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1241 if let Some(nulls) = &self.nulls {
1242 let actual = nulls.len() - nulls.inner().count_set_bits();
1243 if actual != nulls.null_count() {
1244 return Err(ArrowError::InvalidArgumentError(format!(
1245 "null_count value ({}) doesn't match actual number of nulls in array ({})",
1246 nulls.null_count(),
1247 actual
1248 )));
1249 }
1250 }
1251
1252 match &self.data_type {
1257 DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1258 if !f.is_nullable() {
1259 self.validate_non_nullable(None, &self.child_data[0])?
1260 }
1261 }
1262 DataType::FixedSizeList(field, len) => {
1263 let child = &self.child_data[0];
1264 if !field.is_nullable() {
1265 match &self.nulls {
1266 Some(nulls) => {
1267 let element_len = *len as usize;
1268 let expanded = nulls.expand(element_len);
1269 self.validate_non_nullable(Some(&expanded), child)?;
1270 }
1271 None => self.validate_non_nullable(None, child)?,
1272 }
1273 }
1274 }
1275 DataType::Struct(fields) => {
1276 for (field, child) in fields.iter().zip(&self.child_data) {
1277 if !field.is_nullable() {
1278 self.validate_non_nullable(self.nulls(), child)?
1279 }
1280 }
1281 }
1282 _ => {}
1283 }
1284
1285 Ok(())
1286 }
1287
1288 fn validate_non_nullable(
1290 &self,
1291 mask: Option<&NullBuffer>,
1292 child: &ArrayData,
1293 ) -> Result<(), ArrowError> {
1294 let mask = match mask {
1295 Some(mask) => mask,
1296 None => {
1297 return match child.null_count() {
1298 0 => Ok(()),
1299 _ => Err(ArrowError::InvalidArgumentError(format!(
1300 "non-nullable child of type {} contains nulls not present in parent {}",
1301 child.data_type, self.data_type
1302 ))),
1303 };
1304 }
1305 };
1306
1307 match child.nulls() {
1308 Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1309 "non-nullable child of type {} contains nulls not present in parent",
1310 child.data_type
1311 ))),
1312 _ => Ok(()),
1313 }
1314 }
1315
1316 pub fn validate_values(&self) -> Result<(), ArrowError> {
1322 match &self.data_type {
1323 DataType::Utf8 => self.validate_utf8::<i32>(),
1324 DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1325 DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1326 DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1327 DataType::BinaryView => {
1328 let views = self.typed_buffer::<u128>(0, self.len)?;
1329 validate_binary_view(views, &self.buffers[1..])
1330 }
1331 DataType::Utf8View => {
1332 let views = self.typed_buffer::<u128>(0, self.len)?;
1333 validate_string_view(views, &self.buffers[1..])
1334 }
1335 DataType::List(_) | DataType::Map(_, _) => {
1336 let child = &self.child_data[0];
1337 self.validate_offsets_full::<i32>(child.len)
1338 }
1339 DataType::LargeList(_) => {
1340 let child = &self.child_data[0];
1341 self.validate_offsets_full::<i64>(child.len)
1342 }
1343 DataType::Union(_, _) => {
1344 Ok(())
1350 }
1351 DataType::Dictionary(key_type, _value_type) => {
1352 let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1353 let max_value = dictionary_length - 1;
1354 match key_type.as_ref() {
1355 DataType::UInt8 => self.check_bounds::<u8>(max_value),
1356 DataType::UInt16 => self.check_bounds::<u16>(max_value),
1357 DataType::UInt32 => self.check_bounds::<u32>(max_value),
1358 DataType::UInt64 => self.check_bounds::<u64>(max_value),
1359 DataType::Int8 => self.check_bounds::<i8>(max_value),
1360 DataType::Int16 => self.check_bounds::<i16>(max_value),
1361 DataType::Int32 => self.check_bounds::<i32>(max_value),
1362 DataType::Int64 => self.check_bounds::<i64>(max_value),
1363 _ => unreachable!(),
1364 }
1365 }
1366 DataType::RunEndEncoded(run_ends, _values) => {
1367 let run_ends_data = self.child_data()[0].clone();
1368 match run_ends.data_type() {
1369 DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1370 DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1371 DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1372 _ => unreachable!(),
1373 }
1374 }
1375 _ => {
1376 Ok(())
1378 }
1379 }
1380 }
1381
1382 fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1393 where
1394 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1395 V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1396 {
1397 self.typed_offsets::<T>()?
1398 .iter()
1399 .enumerate()
1400 .map(|(i, x)| {
1401 let r = x.to_usize().ok_or_else(|| {
1403 ArrowError::InvalidArgumentError(format!(
1404 "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1405 );
1406 match r {
1408 Ok(n) if n <= offset_limit => Ok((i, n)),
1409 Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1410 "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1411 ),
1412 Err(e) => Err(e),
1413 }
1414 })
1415 .scan(0_usize, |start, end| {
1416 match end {
1418 Ok((i, end)) if *start <= end => {
1419 let range = Some(Ok((i, *start..end)));
1420 *start = end;
1421 range
1422 }
1423 Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1424 "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1425 i - 1, start, end))
1426 )),
1427 Err(err) => Some(Err(err)),
1428 }
1429 })
1430 .skip(1) .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1432 let (item_index, range) = res?;
1433 validate(item_index-1, range)
1434 })
1435 }
1436
1437 fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1440 where
1441 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1442 {
1443 let values_buffer = &self.buffers[1].as_slice();
1444 if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1445 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1447 if !values_str.is_char_boundary(range.start)
1448 || !values_str.is_char_boundary(range.end)
1449 {
1450 return Err(ArrowError::InvalidArgumentError(format!(
1451 "incomplete utf-8 byte sequence from index {string_index}"
1452 )));
1453 }
1454 Ok(())
1455 })
1456 } else {
1457 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1459 std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1460 ArrowError::InvalidArgumentError(format!(
1461 "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1462 ))
1463 })?;
1464 Ok(())
1465 })
1466 }
1467 }
1468
1469 fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1472 where
1473 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1474 {
1475 self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1476 Ok(())
1479 })
1480 }
1481
1482 fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1485 where
1486 T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1487 {
1488 let required_len = self.len + self.offset;
1489 let buffer = &self.buffers[0];
1490
1491 assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1494
1495 let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1497
1498 indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1499 if self.is_null(i) {
1501 return Ok(());
1502 }
1503 let dict_index: i64 = dict_index.try_into().map_err(|_| {
1504 ArrowError::InvalidArgumentError(format!(
1505 "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1506 ))
1507 })?;
1508
1509 if dict_index < 0 || dict_index > max_value {
1510 return Err(ArrowError::InvalidArgumentError(format!(
1511 "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1512 )));
1513 }
1514 Ok(())
1515 })
1516 }
1517
1518 fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1520 where
1521 T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1522 {
1523 let values = self.typed_buffer::<T>(0, self.len)?;
1524 let mut prev_value: i64 = 0_i64;
1525 values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1526 let value: i64 = inp_value.try_into().map_err(|_| {
1527 ArrowError::InvalidArgumentError(format!(
1528 "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1529 ))
1530 })?;
1531 if value <= 0_i64 {
1532 return Err(ArrowError::InvalidArgumentError(format!(
1533 "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1534 )));
1535 }
1536 if ix > 0 && value <= prev_value {
1537 return Err(ArrowError::InvalidArgumentError(format!(
1538 "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1539 )));
1540 }
1541
1542 prev_value = value;
1543 Ok(())
1544 })?;
1545
1546 if prev_value.as_usize() < (self.offset + self.len) {
1547 return Err(ArrowError::InvalidArgumentError(format!(
1548 "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1549 self.offset + self.len
1550 )));
1551 }
1552 Ok(())
1553 }
1554
1555 pub fn ptr_eq(&self, other: &Self) -> bool {
1559 if self.offset != other.offset
1560 || self.len != other.len
1561 || self.data_type != other.data_type
1562 || self.buffers.len() != other.buffers.len()
1563 || self.child_data.len() != other.child_data.len()
1564 {
1565 return false;
1566 }
1567
1568 match (&self.nulls, &other.nulls) {
1569 (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1570 (Some(_), None) | (None, Some(_)) => return false,
1571 _ => {}
1572 };
1573
1574 if !self
1575 .buffers
1576 .iter()
1577 .zip(other.buffers.iter())
1578 .all(|(a, b)| a.as_ptr() == b.as_ptr())
1579 {
1580 return false;
1581 }
1582
1583 self.child_data
1584 .iter()
1585 .zip(other.child_data.iter())
1586 .all(|(a, b)| a.ptr_eq(b))
1587 }
1588
1589 pub fn into_builder(self) -> ArrayDataBuilder {
1591 self.into()
1592 }
1593}
1594
1595pub fn layout(data_type: &DataType) -> DataTypeLayout {
1598 use arrow_schema::IntervalUnit::*;
1601
1602 match data_type {
1603 DataType::Null => DataTypeLayout {
1604 buffers: vec![],
1605 can_contain_null_mask: false,
1606 variadic: false,
1607 },
1608 DataType::Boolean => DataTypeLayout {
1609 buffers: vec![BufferSpec::BitMap],
1610 can_contain_null_mask: true,
1611 variadic: false,
1612 },
1613 DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1614 DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1615 DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1616 DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1617 DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1618 DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1619 DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1620 DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1621 DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1622 DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1623 DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1624 DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1625 DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1626 DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1627 DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1628 DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1629 DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1630 DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1631 DataType::Interval(MonthDayNano) => {
1632 DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1633 }
1634 DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1635 DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1636 DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1637 DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1638 DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1639 DataType::FixedSizeBinary(size) => {
1640 let spec = BufferSpec::FixedWidth {
1641 byte_width: (*size).try_into().unwrap(),
1642 alignment: mem::align_of::<u8>(),
1643 };
1644 DataTypeLayout {
1645 buffers: vec![spec],
1646 can_contain_null_mask: true,
1647 variadic: false,
1648 }
1649 }
1650 DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1651 DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1652 DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1653 DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1654 DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1655 DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1657 DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1658 DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1659 DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1660 DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1661 DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), DataType::Union(_, mode) => {
1664 let type_ids = BufferSpec::FixedWidth {
1665 byte_width: mem::size_of::<i8>(),
1666 alignment: mem::align_of::<i8>(),
1667 };
1668
1669 DataTypeLayout {
1670 buffers: match mode {
1671 UnionMode::Sparse => {
1672 vec![type_ids]
1673 }
1674 UnionMode::Dense => {
1675 vec![
1676 type_ids,
1677 BufferSpec::FixedWidth {
1678 byte_width: mem::size_of::<i32>(),
1679 alignment: mem::align_of::<i32>(),
1680 },
1681 ]
1682 }
1683 },
1684 can_contain_null_mask: false,
1685 variadic: false,
1686 }
1687 }
1688 DataType::Dictionary(key_type, _value_type) => layout(key_type),
1689 }
1690}
1691
1692#[derive(Debug, PartialEq, Eq)]
1694pub struct DataTypeLayout {
1696 pub buffers: Vec<BufferSpec>,
1698
1699 pub can_contain_null_mask: bool,
1701
1702 pub variadic: bool,
1706}
1707
1708impl DataTypeLayout {
1709 pub fn new_fixed_width<T>() -> Self {
1711 Self {
1712 buffers: vec![BufferSpec::FixedWidth {
1713 byte_width: mem::size_of::<T>(),
1714 alignment: mem::align_of::<T>(),
1715 }],
1716 can_contain_null_mask: true,
1717 variadic: false,
1718 }
1719 }
1720
1721 pub fn new_nullable_empty() -> Self {
1724 Self {
1725 buffers: vec![],
1726 can_contain_null_mask: true,
1727 variadic: false,
1728 }
1729 }
1730
1731 pub fn new_empty() -> Self {
1734 Self {
1735 buffers: vec![],
1736 can_contain_null_mask: false,
1737 variadic: false,
1738 }
1739 }
1740
1741 pub fn new_binary<T>() -> Self {
1745 Self {
1746 buffers: vec![
1747 BufferSpec::FixedWidth {
1749 byte_width: mem::size_of::<T>(),
1750 alignment: mem::align_of::<T>(),
1751 },
1752 BufferSpec::VariableWidth,
1754 ],
1755 can_contain_null_mask: true,
1756 variadic: false,
1757 }
1758 }
1759
1760 pub fn new_view() -> Self {
1762 Self {
1763 buffers: vec![BufferSpec::FixedWidth {
1764 byte_width: mem::size_of::<u128>(),
1765 alignment: mem::align_of::<u128>(),
1766 }],
1767 can_contain_null_mask: true,
1768 variadic: true,
1769 }
1770 }
1771
1772 pub fn new_list_view<T>() -> Self {
1774 Self {
1775 buffers: vec![
1776 BufferSpec::FixedWidth {
1777 byte_width: mem::size_of::<T>(),
1778 alignment: mem::align_of::<T>(),
1779 },
1780 BufferSpec::FixedWidth {
1781 byte_width: mem::size_of::<T>(),
1782 alignment: mem::align_of::<T>(),
1783 },
1784 ],
1785 can_contain_null_mask: true,
1786 variadic: true,
1787 }
1788 }
1789}
1790
1791#[derive(Debug, PartialEq, Eq)]
1793pub enum BufferSpec {
1794 FixedWidth {
1805 byte_width: usize,
1807 alignment: usize,
1809 },
1810 VariableWidth,
1812 BitMap,
1818 #[allow(dead_code)]
1821 AlwaysNull,
1822}
1823
1824impl PartialEq for ArrayData {
1825 fn eq(&self, other: &Self) -> bool {
1826 equal::equal(self, other)
1827 }
1828}
1829
1830#[derive(Debug, Clone)]
1849#[doc(hidden)]
1850pub struct UnsafeFlag(bool);
1851
1852impl UnsafeFlag {
1853 #[inline]
1857 pub const fn new() -> Self {
1858 Self(false)
1859 }
1860
1861 #[inline]
1871 pub unsafe fn set(&mut self, val: bool) {
1872 self.0 = val;
1873 }
1874
1875 #[inline]
1877 pub fn get(&self) -> bool {
1878 self.0
1879 }
1880}
1881
1882impl Default for UnsafeFlag {
1884 fn default() -> Self {
1885 Self::new()
1886 }
1887}
1888
1889#[derive(Debug)]
1891pub struct ArrayDataBuilder {
1892 data_type: DataType,
1893 len: usize,
1894 null_count: Option<usize>,
1895 null_bit_buffer: Option<Buffer>,
1896 nulls: Option<NullBuffer>,
1897 offset: usize,
1898 buffers: Vec<Buffer>,
1899 child_data: Vec<ArrayData>,
1900 align_buffers: bool,
1904 skip_validation: UnsafeFlag,
1914}
1915
1916impl ArrayDataBuilder {
1917 #[inline]
1918 pub const fn new(data_type: DataType) -> Self {
1920 Self {
1921 data_type,
1922 len: 0,
1923 null_count: None,
1924 null_bit_buffer: None,
1925 nulls: None,
1926 offset: 0,
1927 buffers: vec![],
1928 child_data: vec![],
1929 align_buffers: false,
1930 skip_validation: UnsafeFlag::new(),
1931 }
1932 }
1933
1934 pub fn data_type(self, data_type: DataType) -> Self {
1936 Self { data_type, ..self }
1937 }
1938
1939 #[inline]
1940 #[allow(clippy::len_without_is_empty)]
1941 pub const fn len(mut self, n: usize) -> Self {
1943 self.len = n;
1944 self
1945 }
1946
1947 pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1949 self.nulls = nulls;
1950 self.null_count = None;
1951 self.null_bit_buffer = None;
1952 self
1953 }
1954
1955 pub fn null_count(mut self, null_count: usize) -> Self {
1957 self.null_count = Some(null_count);
1958 self
1959 }
1960
1961 pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1963 self.nulls = None;
1964 self.null_bit_buffer = buf;
1965 self
1966 }
1967
1968 #[inline]
1970 pub const fn offset(mut self, n: usize) -> Self {
1971 self.offset = n;
1972 self
1973 }
1974
1975 pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1977 self.buffers = v;
1978 self
1979 }
1980
1981 pub fn add_buffer(mut self, b: Buffer) -> Self {
1983 self.buffers.push(b);
1984 self
1985 }
1986
1987 pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1989 self.buffers.extend(bs);
1990 self
1991 }
1992
1993 pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1995 self.child_data = v;
1996 self
1997 }
1998
1999 pub fn add_child_data(mut self, r: ArrayData) -> Self {
2001 self.child_data.push(r);
2002 self
2003 }
2004
2005 pub unsafe fn build_unchecked(self) -> ArrayData {
2021 unsafe { self.skip_validation(true) }.build().unwrap()
2022 }
2023
2024 pub fn build(self) -> Result<ArrayData, ArrowError> {
2033 let Self {
2034 data_type,
2035 len,
2036 null_count,
2037 null_bit_buffer,
2038 nulls,
2039 offset,
2040 buffers,
2041 child_data,
2042 align_buffers,
2043 skip_validation,
2044 } = self;
2045
2046 let nulls = nulls
2047 .or_else(|| {
2048 let buffer = null_bit_buffer?;
2049 let buffer = BooleanBuffer::new(buffer, offset, len);
2050 Some(match null_count {
2051 Some(n) => {
2052 unsafe { NullBuffer::new_unchecked(buffer, n) }
2054 }
2055 None => NullBuffer::new(buffer),
2056 })
2057 })
2058 .filter(|b| b.null_count() != 0);
2059
2060 let mut data = ArrayData {
2061 data_type,
2062 len,
2063 offset,
2064 buffers,
2065 child_data,
2066 nulls,
2067 };
2068
2069 if align_buffers {
2070 data.align_buffers();
2071 }
2072
2073 if !skip_validation.get() || cfg!(feature = "force_validate") {
2075 data.validate_data()?;
2076 }
2077 Ok(data)
2078 }
2079
2080 #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2082 pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2083 self.align_buffers(true).build()
2084 }
2085
2086 pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2102 self.align_buffers = align_buffers;
2103 self
2104 }
2105
2106 pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2120 unsafe {
2121 self.skip_validation.set(skip_validation);
2122 }
2123 self
2124 }
2125}
2126
2127impl From<ArrayData> for ArrayDataBuilder {
2128 fn from(d: ArrayData) -> Self {
2129 Self {
2130 data_type: d.data_type,
2131 len: d.len,
2132 offset: d.offset,
2133 buffers: d.buffers,
2134 child_data: d.child_data,
2135 nulls: d.nulls,
2136 null_bit_buffer: None,
2137 null_count: None,
2138 align_buffers: false,
2139 skip_validation: UnsafeFlag::new(),
2140 }
2141 }
2142}
2143
2144#[cfg(test)]
2145mod tests {
2146 use super::*;
2147 use arrow_schema::{Field, Fields};
2148
2149 fn make_i32_buffer(n: usize) -> Buffer {
2153 Buffer::from_slice_ref(vec![42i32; n])
2154 }
2155
2156 fn make_f32_buffer(n: usize) -> Buffer {
2158 Buffer::from_slice_ref(vec![42f32; n])
2159 }
2160
2161 #[test]
2162 fn test_builder() {
2163 let v = (0..25).collect::<Vec<i32>>();
2165 let b1 = Buffer::from_slice_ref(&v);
2166 let arr_data = ArrayData::builder(DataType::Int32)
2167 .len(20)
2168 .offset(5)
2169 .add_buffer(b1)
2170 .null_bit_buffer(Some(Buffer::from([
2171 0b01011111, 0b10110101, 0b01100011, 0b00011110,
2172 ])))
2173 .build()
2174 .unwrap();
2175
2176 assert_eq!(20, arr_data.len());
2177 assert_eq!(10, arr_data.null_count());
2178 assert_eq!(5, arr_data.offset());
2179 assert_eq!(1, arr_data.buffers().len());
2180 assert_eq!(
2181 Buffer::from_slice_ref(&v).as_slice(),
2182 arr_data.buffers()[0].as_slice()
2183 );
2184 }
2185
2186 #[test]
2187 fn test_builder_with_child_data() {
2188 let child_arr_data = ArrayData::try_new(
2189 DataType::Int32,
2190 5,
2191 None,
2192 0,
2193 vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2194 vec![],
2195 )
2196 .unwrap();
2197
2198 let field = Arc::new(Field::new("x", DataType::Int32, true));
2199 let data_type = DataType::Struct(vec![field].into());
2200
2201 let arr_data = ArrayData::builder(data_type)
2202 .len(5)
2203 .offset(0)
2204 .add_child_data(child_arr_data.clone())
2205 .build()
2206 .unwrap();
2207
2208 assert_eq!(5, arr_data.len());
2209 assert_eq!(1, arr_data.child_data().len());
2210 assert_eq!(child_arr_data, arr_data.child_data()[0]);
2211 }
2212
2213 #[test]
2214 fn test_null_count() {
2215 let mut bit_v: [u8; 2] = [0; 2];
2216 bit_util::set_bit(&mut bit_v, 0);
2217 bit_util::set_bit(&mut bit_v, 3);
2218 bit_util::set_bit(&mut bit_v, 10);
2219 let arr_data = ArrayData::builder(DataType::Int32)
2220 .len(16)
2221 .add_buffer(make_i32_buffer(16))
2222 .null_bit_buffer(Some(Buffer::from(bit_v)))
2223 .build()
2224 .unwrap();
2225 assert_eq!(13, arr_data.null_count());
2226
2227 let mut bit_v: [u8; 2] = [0; 2];
2229 bit_util::set_bit(&mut bit_v, 0);
2230 bit_util::set_bit(&mut bit_v, 3);
2231 bit_util::set_bit(&mut bit_v, 10);
2232 let arr_data = ArrayData::builder(DataType::Int32)
2233 .len(12)
2234 .offset(2)
2235 .add_buffer(make_i32_buffer(14)) .null_bit_buffer(Some(Buffer::from(bit_v)))
2237 .build()
2238 .unwrap();
2239 assert_eq!(10, arr_data.null_count());
2240 }
2241
2242 #[test]
2243 fn test_null_buffer_ref() {
2244 let mut bit_v: [u8; 2] = [0; 2];
2245 bit_util::set_bit(&mut bit_v, 0);
2246 bit_util::set_bit(&mut bit_v, 3);
2247 bit_util::set_bit(&mut bit_v, 10);
2248 let arr_data = ArrayData::builder(DataType::Int32)
2249 .len(16)
2250 .add_buffer(make_i32_buffer(16))
2251 .null_bit_buffer(Some(Buffer::from(bit_v)))
2252 .build()
2253 .unwrap();
2254 assert!(arr_data.nulls().is_some());
2255 assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2256 }
2257
2258 #[test]
2259 fn test_slice() {
2260 let mut bit_v: [u8; 2] = [0; 2];
2261 bit_util::set_bit(&mut bit_v, 0);
2262 bit_util::set_bit(&mut bit_v, 3);
2263 bit_util::set_bit(&mut bit_v, 10);
2264 let data = ArrayData::builder(DataType::Int32)
2265 .len(16)
2266 .add_buffer(make_i32_buffer(16))
2267 .null_bit_buffer(Some(Buffer::from(bit_v)))
2268 .build()
2269 .unwrap();
2270 let new_data = data.slice(1, 15);
2271 assert_eq!(data.len() - 1, new_data.len());
2272 assert_eq!(1, new_data.offset());
2273 assert_eq!(data.null_count(), new_data.null_count());
2274
2275 let new_data = new_data.slice(1, 14);
2277 assert_eq!(data.len() - 2, new_data.len());
2278 assert_eq!(2, new_data.offset());
2279 assert_eq!(data.null_count() - 1, new_data.null_count());
2280 }
2281
2282 #[test]
2283 fn test_equality() {
2284 let int_data = ArrayData::builder(DataType::Int32)
2285 .len(1)
2286 .add_buffer(make_i32_buffer(1))
2287 .build()
2288 .unwrap();
2289
2290 let float_data = ArrayData::builder(DataType::Float32)
2291 .len(1)
2292 .add_buffer(make_f32_buffer(1))
2293 .build()
2294 .unwrap();
2295 assert_ne!(int_data, float_data);
2296 assert!(!int_data.ptr_eq(&float_data));
2297 assert!(int_data.ptr_eq(&int_data));
2298
2299 #[allow(clippy::redundant_clone)]
2300 let int_data_clone = int_data.clone();
2301 assert_eq!(int_data, int_data_clone);
2302 assert!(int_data.ptr_eq(&int_data_clone));
2303 assert!(int_data_clone.ptr_eq(&int_data));
2304
2305 let int_data_slice = int_data_clone.slice(1, 0);
2306 assert!(int_data_slice.ptr_eq(&int_data_slice));
2307 assert!(!int_data.ptr_eq(&int_data_slice));
2308 assert!(!int_data_slice.ptr_eq(&int_data));
2309
2310 let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2311 let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2312 let string_data = ArrayData::try_new(
2313 DataType::Utf8,
2314 3,
2315 Some(Buffer::from_iter(vec![true, false, true])),
2316 0,
2317 vec![offsets_buffer, data_buffer],
2318 vec![],
2319 )
2320 .unwrap();
2321
2322 assert_ne!(float_data, string_data);
2323 assert!(!float_data.ptr_eq(&string_data));
2324
2325 assert!(string_data.ptr_eq(&string_data));
2326
2327 #[allow(clippy::redundant_clone)]
2328 let string_data_cloned = string_data.clone();
2329 assert!(string_data_cloned.ptr_eq(&string_data));
2330 assert!(string_data.ptr_eq(&string_data_cloned));
2331
2332 let string_data_slice = string_data.slice(1, 2);
2333 assert!(string_data_slice.ptr_eq(&string_data_slice));
2334 assert!(!string_data_slice.ptr_eq(&string_data))
2335 }
2336
2337 #[test]
2338 fn test_slice_memory_size() {
2339 let mut bit_v: [u8; 2] = [0; 2];
2340 bit_util::set_bit(&mut bit_v, 0);
2341 bit_util::set_bit(&mut bit_v, 3);
2342 bit_util::set_bit(&mut bit_v, 10);
2343 let data = ArrayData::builder(DataType::Int32)
2344 .len(16)
2345 .add_buffer(make_i32_buffer(16))
2346 .null_bit_buffer(Some(Buffer::from(bit_v)))
2347 .build()
2348 .unwrap();
2349 let new_data = data.slice(1, 14);
2350 assert_eq!(
2351 data.get_slice_memory_size().unwrap() - 8,
2352 new_data.get_slice_memory_size().unwrap()
2353 );
2354 let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2355 let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2356 let string_data = ArrayData::try_new(
2357 DataType::Utf8,
2358 3,
2359 Some(Buffer::from_iter(vec![true, false, true])),
2360 0,
2361 vec![offsets_buffer, data_buffer],
2362 vec![],
2363 )
2364 .unwrap();
2365 let string_data_slice = string_data.slice(1, 2);
2366 assert_eq!(
2368 string_data.get_slice_memory_size().unwrap() - 6,
2369 string_data_slice.get_slice_memory_size().unwrap()
2370 );
2371 }
2372
2373 #[test]
2374 fn test_count_nulls() {
2375 let buffer = Buffer::from([0b00010110, 0b10011111]);
2376 let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2377 let count = count_nulls(Some(&buffer), 0, 16);
2378 assert_eq!(count, 7);
2379
2380 let count = count_nulls(Some(&buffer), 4, 8);
2381 assert_eq!(count, 3);
2382 }
2383
2384 #[test]
2385 fn test_contains_nulls() {
2386 let buffer: Buffer =
2387 MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2388 let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2389 assert!(contains_nulls(Some(&buffer), 0, 6));
2390 assert!(contains_nulls(Some(&buffer), 0, 3));
2391 assert!(!contains_nulls(Some(&buffer), 3, 2));
2392 assert!(!contains_nulls(Some(&buffer), 0, 0));
2393 }
2394
2395 #[test]
2396 fn test_alignment() {
2397 let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2398 let sliced = buffer.slice(1);
2399
2400 let mut data = ArrayData {
2401 data_type: DataType::Int32,
2402 len: 0,
2403 offset: 0,
2404 buffers: vec![buffer],
2405 child_data: vec![],
2406 nulls: None,
2407 };
2408 data.validate_full().unwrap();
2409
2410 data.buffers[0] = sliced;
2412 let err = data.validate().unwrap_err();
2413
2414 assert_eq!(
2415 err.to_string(),
2416 "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2417 );
2418
2419 data.align_buffers();
2420 data.validate_full().unwrap();
2421 }
2422
2423 #[test]
2424 fn test_alignment_struct() {
2425 let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2426 let sliced = buffer.slice(1);
2427
2428 let child_data = ArrayData {
2429 data_type: DataType::Int32,
2430 len: 0,
2431 offset: 0,
2432 buffers: vec![buffer],
2433 child_data: vec![],
2434 nulls: None,
2435 };
2436
2437 let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2438 let mut data = ArrayData {
2439 data_type: schema,
2440 len: 0,
2441 offset: 0,
2442 buffers: vec![],
2443 child_data: vec![child_data],
2444 nulls: None,
2445 };
2446 data.validate_full().unwrap();
2447
2448 data.child_data[0].buffers[0] = sliced;
2450 let err = data.validate().unwrap_err();
2451
2452 assert_eq!(
2453 err.to_string(),
2454 "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2455 );
2456
2457 data.align_buffers();
2458 data.validate_full().unwrap();
2459 }
2460
2461 #[test]
2462 fn test_null_view_types() {
2463 let array_len = 32;
2464 let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2465 assert_eq!(array.len(), array_len);
2466 for i in 0..array.len() {
2467 assert!(array.is_null(i));
2468 }
2469
2470 let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2471 assert_eq!(array.len(), array_len);
2472 for i in 0..array.len() {
2473 assert!(array.is_null(i));
2474 }
2475 }
2476}