1use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24 ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35 null_bit_buffer: Option<&NullBuffer>,
36 offset: usize,
37 len: usize,
38) -> bool {
39 match null_bit_buffer {
40 Some(buffer) => {
41 match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42 Some((start, end)) => start != 0 || end != len,
43 None => len != 0, }
45 }
46 None => false, }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52 null_bit_buffer: Option<&NullBuffer>,
53 offset: usize,
54 len: usize,
55) -> usize {
56 if let Some(buf) = null_bit_buffer {
57 let buffer = buf.buffer();
58 len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59 } else {
60 0
61 }
62}
63
64#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67 let empty_buffer = MutableBuffer::new(0);
68 match data_type {
69 DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70 DataType::Boolean => {
71 let bytes = bit_util::ceil(capacity, 8);
72 let buffer = MutableBuffer::new(bytes);
73 [buffer, empty_buffer]
74 }
75 DataType::UInt8
76 | DataType::UInt16
77 | DataType::UInt32
78 | DataType::UInt64
79 | DataType::Int8
80 | DataType::Int16
81 | DataType::Int32
82 | DataType::Int64
83 | DataType::Float16
84 | DataType::Float32
85 | DataType::Float64
86 | DataType::Decimal32(_, _)
87 | DataType::Decimal64(_, _)
88 | DataType::Decimal128(_, _)
89 | DataType::Decimal256(_, _)
90 | DataType::Date32
91 | DataType::Time32(_)
92 | DataType::Date64
93 | DataType::Time64(_)
94 | DataType::Duration(_)
95 | DataType::Timestamp(_, _)
96 | DataType::Interval(_) => [
97 MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98 empty_buffer,
99 ],
100 DataType::Utf8 | DataType::Binary => {
101 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102 buffer.push(0i32);
104 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105 }
106 DataType::LargeUtf8 | DataType::LargeBinary => {
107 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108 buffer.push(0i64);
110 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111 }
112 DataType::BinaryView | DataType::Utf8View => [
113 MutableBuffer::new(capacity * mem::size_of::<u128>()),
114 empty_buffer,
115 ],
116 DataType::List(_) | DataType::Map(_, _) => {
117 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119 buffer.push(0i32);
120 [buffer, empty_buffer]
121 }
122 DataType::ListView(_) => [
123 MutableBuffer::new(capacity * mem::size_of::<i32>()),
124 MutableBuffer::new(capacity * mem::size_of::<i32>()),
125 ],
126 DataType::LargeList(_) => {
127 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129 buffer.push(0i64);
130 [buffer, empty_buffer]
131 }
132 DataType::LargeListView(_) => [
133 MutableBuffer::new(capacity * mem::size_of::<i64>()),
134 MutableBuffer::new(capacity * mem::size_of::<i64>()),
135 ],
136 DataType::FixedSizeBinary(size) => {
137 [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138 }
139 DataType::Dictionary(k, _) => [
140 MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141 empty_buffer,
142 ],
143 DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144 [empty_buffer, MutableBuffer::new(0)]
145 }
146 DataType::Union(_, mode) => {
147 let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148 match mode {
149 UnionMode::Sparse => [type_ids, empty_buffer],
150 UnionMode::Dense => {
151 let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152 [type_ids, offsets]
153 }
154 }
155 }
156 }
157}
158
159#[derive(Debug, Clone)]
205pub struct ArrayData {
206 data_type: DataType,
208
209 len: usize,
211
212 offset: usize,
217
218 buffers: Vec<Buffer>,
231
232 child_data: Vec<ArrayData>,
242
243 nulls: Option<NullBuffer>,
251}
252
253pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257 pub unsafe fn new_unchecked(
274 data_type: DataType,
275 len: usize,
276 null_count: Option<usize>,
277 null_bit_buffer: Option<Buffer>,
278 offset: usize,
279 buffers: Vec<Buffer>,
280 child_data: Vec<ArrayData>,
281 ) -> Self {
282 let mut skip_validation = UnsafeFlag::new();
283 unsafe { skip_validation.set(true) };
285
286 ArrayDataBuilder {
287 data_type,
288 len,
289 null_count,
290 null_bit_buffer,
291 nulls: None,
292 offset,
293 buffers,
294 child_data,
295 align_buffers: false,
296 skip_validation,
297 }
298 .build()
299 .unwrap()
300 }
301
302 pub fn try_new(
313 data_type: DataType,
314 len: usize,
315 null_bit_buffer: Option<Buffer>,
316 offset: usize,
317 buffers: Vec<Buffer>,
318 child_data: Vec<ArrayData>,
319 ) -> Result<Self, ArrowError> {
320 if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
324 let needed_len = bit_util::ceil(len + offset, 8);
325 if null_bit_buffer.len() < needed_len {
326 return Err(ArrowError::InvalidArgumentError(format!(
327 "null_bit_buffer size too small. got {} needed {}",
328 null_bit_buffer.len(),
329 needed_len
330 )));
331 }
332 }
333 let new_self = unsafe {
335 Self::new_unchecked(
336 data_type,
337 len,
338 None,
339 null_bit_buffer,
340 offset,
341 buffers,
342 child_data,
343 )
344 };
345
346 new_self.validate_data()?;
351 Ok(new_self)
352 }
353
354 #[inline]
356 pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
357 ArrayDataBuilder::new(data_type)
358 }
359
360 #[inline]
362 pub const fn data_type(&self) -> &DataType {
363 &self.data_type
364 }
365
366 pub fn buffers(&self) -> &[Buffer] {
368 &self.buffers
369 }
370
371 pub fn child_data(&self) -> &[ArrayData] {
374 &self.child_data[..]
375 }
376
377 #[inline]
379 pub fn is_null(&self, i: usize) -> bool {
380 match &self.nulls {
381 Some(v) => v.is_null(i),
382 None => false,
383 }
384 }
385
386 #[inline]
390 pub fn nulls(&self) -> Option<&NullBuffer> {
391 self.nulls.as_ref()
392 }
393
394 #[inline]
396 pub fn is_valid(&self, i: usize) -> bool {
397 !self.is_null(i)
398 }
399
400 #[inline]
402 pub const fn len(&self) -> usize {
403 self.len
404 }
405
406 #[inline]
408 pub const fn is_empty(&self) -> bool {
409 self.len == 0
410 }
411
412 #[inline]
414 pub const fn offset(&self) -> usize {
415 self.offset
416 }
417
418 #[inline]
420 pub fn null_count(&self) -> usize {
421 self.nulls
422 .as_ref()
423 .map(|x| x.null_count())
424 .unwrap_or_default()
425 }
426
427 pub fn get_buffer_memory_size(&self) -> usize {
439 let mut size = 0;
440 for buffer in &self.buffers {
441 size += buffer.capacity();
442 }
443 if let Some(bitmap) = &self.nulls {
444 size += bitmap.buffer().capacity()
445 }
446 for child in &self.child_data {
447 size += child.get_buffer_memory_size();
448 }
449 size
450 }
451
452 pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
465 let mut result: usize = 0;
466 let layout = layout(&self.data_type);
467
468 for spec in layout.buffers.iter() {
469 match spec {
470 BufferSpec::FixedWidth { byte_width, .. } => {
471 let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
472 ArrowError::ComputeError(
473 "Integer overflow computing buffer size".to_string(),
474 )
475 })?;
476 result += buffer_size;
477 }
478 BufferSpec::VariableWidth => {
479 let buffer_len = match self.data_type {
480 DataType::Utf8 | DataType::Binary => {
481 let offsets = self.typed_offsets::<i32>()?;
482 (offsets[self.len] - offsets[0]) as usize
483 }
484 DataType::LargeUtf8 | DataType::LargeBinary => {
485 let offsets = self.typed_offsets::<i64>()?;
486 (offsets[self.len] - offsets[0]) as usize
487 }
488 _ => {
489 return Err(ArrowError::NotYetImplemented(format!(
490 "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
491 self.data_type
492 )));
493 }
494 };
495 result += buffer_len;
496 }
497 BufferSpec::BitMap => {
498 let buffer_size = bit_util::ceil(self.len, 8);
499 result += buffer_size;
500 }
501 BufferSpec::AlwaysNull => {
502 }
504 }
505 }
506
507 if self.nulls().is_some() {
508 result += bit_util::ceil(self.len, 8);
509 }
510
511 for child in &self.child_data {
512 result += child.get_slice_memory_size()?;
513 }
514 Ok(result)
515 }
516
517 pub fn get_array_memory_size(&self) -> usize {
526 let mut size = mem::size_of_val(self);
527
528 for buffer in &self.buffers {
530 size += mem::size_of::<Buffer>();
531 size += buffer.capacity();
532 }
533 if let Some(nulls) = &self.nulls {
534 size += nulls.buffer().capacity();
535 }
536 for child in &self.child_data {
537 size += child.get_array_memory_size();
538 }
539
540 size
541 }
542
543 pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
551 assert!((offset + length) <= self.len());
552
553 if let DataType::Struct(_) = self.data_type() {
554 let new_offset = self.offset + offset;
556 ArrayData {
557 data_type: self.data_type().clone(),
558 len: length,
559 offset: new_offset,
560 buffers: self.buffers.clone(),
561 child_data: self
563 .child_data()
564 .iter()
565 .map(|data| data.slice(offset, length))
566 .collect(),
567 nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
568 }
569 } else {
570 let mut new_data = self.clone();
571
572 new_data.len = length;
573 new_data.offset = offset + self.offset;
574 new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
575
576 new_data
577 }
578 }
579
580 pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
587 &self.buffers()[buffer].typed_data()[self.offset..]
588 }
589
590 pub fn new_null(data_type: &DataType, len: usize) -> Self {
592 let bit_len = bit_util::ceil(len, 8);
593 let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
594
595 let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
596 Some(width) => (vec![zeroed(width * len)], vec![], true),
597 None => match data_type {
598 DataType::Null => (vec![], vec![], false),
599 DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
600 DataType::Binary | DataType::Utf8 => {
601 (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
602 }
603 DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
604 DataType::LargeBinary | DataType::LargeUtf8 => {
605 (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
606 }
607 DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
608 DataType::List(f) | DataType::Map(f, _) => (
609 vec![zeroed((len + 1) * 4)],
610 vec![ArrayData::new_empty(f.data_type())],
611 true,
612 ),
613 DataType::LargeList(f) => (
614 vec![zeroed((len + 1) * 8)],
615 vec![ArrayData::new_empty(f.data_type())],
616 true,
617 ),
618 DataType::FixedSizeList(f, list_len) => (
619 vec![],
620 vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
621 true,
622 ),
623 DataType::Struct(fields) => (
624 vec![],
625 fields
626 .iter()
627 .map(|f| Self::new_null(f.data_type(), len))
628 .collect(),
629 true,
630 ),
631 DataType::Dictionary(k, v) => (
632 vec![zeroed(k.primitive_width().unwrap() * len)],
633 vec![ArrayData::new_empty(v.as_ref())],
634 true,
635 ),
636 DataType::Union(f, mode) => {
637 let (id, _) = f.iter().next().unwrap();
638 let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
639 let buffers = match mode {
640 UnionMode::Sparse => vec![ids],
641 UnionMode::Dense => {
642 let end_offset = i32::from_usize(len).unwrap();
643 vec![ids, Buffer::from_iter(0_i32..end_offset)]
644 }
645 };
646
647 let children = f
648 .iter()
649 .enumerate()
650 .map(|(idx, (_, f))| {
651 if idx == 0 || *mode == UnionMode::Sparse {
652 Self::new_null(f.data_type(), len)
653 } else {
654 Self::new_empty(f.data_type())
655 }
656 })
657 .collect();
658
659 (buffers, children, false)
660 }
661 DataType::RunEndEncoded(r, v) => {
662 let runs = match r.data_type() {
663 DataType::Int16 => {
664 let i = i16::from_usize(len).expect("run overflow");
665 Buffer::from_slice_ref([i])
666 }
667 DataType::Int32 => {
668 let i = i32::from_usize(len).expect("run overflow");
669 Buffer::from_slice_ref([i])
670 }
671 DataType::Int64 => {
672 let i = i64::from_usize(len).expect("run overflow");
673 Buffer::from_slice_ref([i])
674 }
675 dt => unreachable!("Invalid run ends data type {dt}"),
676 };
677
678 let builder = ArrayData::builder(r.data_type().clone())
679 .len(1)
680 .buffers(vec![runs]);
681
682 let runs = unsafe { builder.build_unchecked() };
685 (
686 vec![],
687 vec![runs, ArrayData::new_null(v.data_type(), 1)],
688 false,
689 )
690 }
691 d => unreachable!("{d}"),
692 },
693 };
694
695 let mut builder = ArrayDataBuilder::new(data_type.clone())
696 .len(len)
697 .buffers(buffers)
698 .child_data(child_data);
699
700 if has_nulls {
701 builder = builder.nulls(Some(NullBuffer::new_null(len)))
702 }
703
704 unsafe { builder.build_unchecked() }
707 }
708
709 pub fn new_empty(data_type: &DataType) -> Self {
711 Self::new_null(data_type, 0)
712 }
713
714 pub fn align_buffers(&mut self) {
723 let layout = layout(&self.data_type);
724 for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
725 if let BufferSpec::FixedWidth { alignment, .. } = spec {
726 if buffer.as_ptr().align_offset(*alignment) != 0 {
727 *buffer = Buffer::from_slice_ref(buffer.as_ref());
728 }
729 }
730 }
731 for data in self.child_data.iter_mut() {
733 data.align_buffers()
734 }
735 }
736
737 pub fn validate(&self) -> Result<(), ArrowError> {
748 let len_plus_offset = self.len + self.offset;
750
751 let layout = layout(&self.data_type);
753
754 if !layout.can_contain_null_mask && self.nulls.is_some() {
755 return Err(ArrowError::InvalidArgumentError(format!(
756 "Arrays of type {:?} cannot contain a null bitmask",
757 self.data_type,
758 )));
759 }
760
761 if self.buffers.len() < layout.buffers.len()
763 || (!layout.variadic && self.buffers.len() != layout.buffers.len())
764 {
765 return Err(ArrowError::InvalidArgumentError(format!(
766 "Expected {} buffers in array of type {:?}, got {}",
767 layout.buffers.len(),
768 self.data_type,
769 self.buffers.len(),
770 )));
771 }
772
773 for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
774 match spec {
775 BufferSpec::FixedWidth {
776 byte_width,
777 alignment,
778 } => {
779 let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
780
781 if buffer.len() < min_buffer_size {
782 return Err(ArrowError::InvalidArgumentError(format!(
783 "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
784 min_buffer_size,
785 i,
786 self.data_type,
787 buffer.len()
788 )));
789 }
790
791 let align_offset = buffer.as_ptr().align_offset(*alignment);
792 if align_offset != 0 {
793 return Err(ArrowError::InvalidArgumentError(format!(
794 "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
795 self.data_type,
796 align_offset.min(alignment - align_offset)
797 )));
798 }
799 }
800 BufferSpec::VariableWidth => {
801 }
805 BufferSpec::BitMap => {
806 let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
807 if buffer.len() < min_buffer_size {
808 return Err(ArrowError::InvalidArgumentError(format!(
809 "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
810 min_buffer_size,
811 i,
812 self.data_type,
813 buffer.len()
814 )));
815 }
816 }
817 BufferSpec::AlwaysNull => {
818 }
820 }
821 }
822
823 if let Some(nulls) = self.nulls() {
825 if nulls.null_count() > self.len {
826 return Err(ArrowError::InvalidArgumentError(format!(
827 "null_count {} for an array exceeds length of {} elements",
828 nulls.null_count(),
829 self.len
830 )));
831 }
832
833 let actual_len = nulls.validity().len();
834 let needed_len = bit_util::ceil(len_plus_offset, 8);
835 if actual_len < needed_len {
836 return Err(ArrowError::InvalidArgumentError(format!(
837 "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
838 )));
839 }
840
841 if nulls.len() != self.len {
842 return Err(ArrowError::InvalidArgumentError(format!(
843 "null buffer incorrect size. got {} expected {}",
844 nulls.len(),
845 self.len
846 )));
847 }
848 }
849
850 self.validate_child_data()?;
851
852 match &self.data_type {
854 DataType::Utf8 | DataType::Binary => {
855 self.validate_offsets::<i32>(self.buffers[1].len())?;
856 }
857 DataType::LargeUtf8 | DataType::LargeBinary => {
858 self.validate_offsets::<i64>(self.buffers[1].len())?;
859 }
860 DataType::Dictionary(key_type, _value_type) => {
861 if !DataType::is_dictionary_key_type(key_type) {
863 return Err(ArrowError::InvalidArgumentError(format!(
864 "Dictionary key type must be integer, but was {key_type}"
865 )));
866 }
867 }
868 DataType::RunEndEncoded(run_ends_type, _) => {
869 if run_ends_type.is_nullable() {
870 return Err(ArrowError::InvalidArgumentError(
871 "The nullable should be set to false for the field defining run_ends array.".to_string()
872 ));
873 }
874 if !DataType::is_run_ends_type(run_ends_type.data_type()) {
875 return Err(ArrowError::InvalidArgumentError(format!(
876 "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
877 run_ends_type.data_type()
878 )));
879 }
880 }
881 _ => {}
882 };
883
884 Ok(())
885 }
886
887 fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
894 if self.len == 0 && self.buffers[0].is_empty() {
896 return Ok(&[]);
897 }
898
899 self.typed_buffer(0, self.len + 1)
900 }
901
902 fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
904 &self,
905 idx: usize,
906 len: usize,
907 ) -> Result<&[T], ArrowError> {
908 let buffer = &self.buffers[idx];
909
910 let required_len = (len + self.offset) * mem::size_of::<T>();
911
912 if buffer.len() < required_len {
913 return Err(ArrowError::InvalidArgumentError(format!(
914 "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
915 idx,
916 self.data_type,
917 required_len,
918 buffer.len()
919 )));
920 }
921
922 Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
923 }
924
925 fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
928 &self,
929 values_length: usize,
930 ) -> Result<(), ArrowError> {
931 let offsets = self.typed_offsets::<T>()?;
933 if offsets.is_empty() {
934 return Ok(());
935 }
936
937 let first_offset = offsets[0].to_usize().ok_or_else(|| {
938 ArrowError::InvalidArgumentError(format!(
939 "Error converting offset[0] ({}) to usize for {}",
940 offsets[0], self.data_type
941 ))
942 })?;
943
944 let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
945 ArrowError::InvalidArgumentError(format!(
946 "Error converting offset[{}] ({}) to usize for {}",
947 self.len, offsets[self.len], self.data_type
948 ))
949 })?;
950
951 if first_offset > values_length {
952 return Err(ArrowError::InvalidArgumentError(format!(
953 "First offset {} of {} is larger than values length {}",
954 first_offset, self.data_type, values_length,
955 )));
956 }
957
958 if last_offset > values_length {
959 return Err(ArrowError::InvalidArgumentError(format!(
960 "Last offset {} of {} is larger than values length {}",
961 last_offset, self.data_type, values_length,
962 )));
963 }
964
965 if first_offset > last_offset {
966 return Err(ArrowError::InvalidArgumentError(format!(
967 "First offset {} in {} is smaller than last offset {}",
968 first_offset, self.data_type, last_offset,
969 )));
970 }
971
972 Ok(())
973 }
974
975 fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
978 &self,
979 values_length: usize,
980 ) -> Result<(), ArrowError> {
981 let offsets: &[T] = self.typed_buffer(0, self.len)?;
982 let sizes: &[T] = self.typed_buffer(1, self.len)?;
983 for i in 0..values_length {
984 let size = sizes[i].to_usize().ok_or_else(|| {
985 ArrowError::InvalidArgumentError(format!(
986 "Error converting size[{}] ({}) to usize for {}",
987 i, sizes[i], self.data_type
988 ))
989 })?;
990 let offset = offsets[i].to_usize().ok_or_else(|| {
991 ArrowError::InvalidArgumentError(format!(
992 "Error converting offset[{}] ({}) to usize for {}",
993 i, offsets[i], self.data_type
994 ))
995 })?;
996 if size
997 .checked_add(offset)
998 .expect("Offset and size have exceeded the usize boundary")
999 > values_length
1000 {
1001 return Err(ArrowError::InvalidArgumentError(format!(
1002 "Size {} at index {} is larger than the remaining values for {}",
1003 size, i, self.data_type
1004 )));
1005 }
1006 }
1007 Ok(())
1008 }
1009
1010 fn validate_child_data(&self) -> Result<(), ArrowError> {
1012 match &self.data_type {
1013 DataType::List(field) | DataType::Map(field, _) => {
1014 let values_data = self.get_single_valid_child_data(field.data_type())?;
1015 self.validate_offsets::<i32>(values_data.len)?;
1016 Ok(())
1017 }
1018 DataType::LargeList(field) => {
1019 let values_data = self.get_single_valid_child_data(field.data_type())?;
1020 self.validate_offsets::<i64>(values_data.len)?;
1021 Ok(())
1022 }
1023 DataType::ListView(field) => {
1024 let values_data = self.get_single_valid_child_data(field.data_type())?;
1025 self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1026 Ok(())
1027 }
1028 DataType::LargeListView(field) => {
1029 let values_data = self.get_single_valid_child_data(field.data_type())?;
1030 self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1031 Ok(())
1032 }
1033 DataType::FixedSizeList(field, list_size) => {
1034 let values_data = self.get_single_valid_child_data(field.data_type())?;
1035
1036 let list_size: usize = (*list_size).try_into().map_err(|_| {
1037 ArrowError::InvalidArgumentError(format!(
1038 "{} has a negative list_size {}",
1039 self.data_type, list_size
1040 ))
1041 })?;
1042
1043 let expected_values_len = self.len
1044 .checked_mul(list_size)
1045 .expect("integer overflow computing expected number of expected values in FixedListSize");
1046
1047 if values_data.len < expected_values_len {
1048 return Err(ArrowError::InvalidArgumentError(format!(
1049 "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1050 values_data.len, self.len, list_size, self.data_type
1051 )));
1052 }
1053
1054 Ok(())
1055 }
1056 DataType::Struct(fields) => {
1057 self.validate_num_child_data(fields.len())?;
1058 for (i, field) in fields.iter().enumerate() {
1059 let field_data = self.get_valid_child_data(i, field.data_type())?;
1060
1061 if field_data.len < self.len {
1063 return Err(ArrowError::InvalidArgumentError(format!(
1064 "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1065 self.data_type,
1066 i,
1067 field.name(),
1068 field_data.len,
1069 self.len
1070 )));
1071 }
1072 }
1073 Ok(())
1074 }
1075 DataType::RunEndEncoded(run_ends_field, values_field) => {
1076 self.validate_num_child_data(2)?;
1077 let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1078 let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1079 if run_ends_data.len != values_data.len {
1080 return Err(ArrowError::InvalidArgumentError(format!(
1081 "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1082 run_ends_data.len, values_data.len
1083 )));
1084 }
1085 if run_ends_data.nulls.is_some() {
1086 return Err(ArrowError::InvalidArgumentError(
1087 "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1088 ));
1089 }
1090 Ok(())
1091 }
1092 DataType::Union(fields, mode) => {
1093 self.validate_num_child_data(fields.len())?;
1094
1095 for (i, (_, field)) in fields.iter().enumerate() {
1096 let field_data = self.get_valid_child_data(i, field.data_type())?;
1097
1098 if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1099 return Err(ArrowError::InvalidArgumentError(format!(
1100 "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1101 i,
1102 field_data.len,
1103 self.len + self.offset
1104 )));
1105 }
1106 }
1107 Ok(())
1108 }
1109 DataType::Dictionary(_key_type, value_type) => {
1110 self.get_single_valid_child_data(value_type)?;
1111 Ok(())
1112 }
1113 _ => {
1114 if !self.child_data.is_empty() {
1116 return Err(ArrowError::InvalidArgumentError(format!(
1117 "Expected no child arrays for type {} but got {}",
1118 self.data_type,
1119 self.child_data.len()
1120 )));
1121 }
1122 Ok(())
1123 }
1124 }
1125 }
1126
1127 fn get_single_valid_child_data(
1131 &self,
1132 expected_type: &DataType,
1133 ) -> Result<&ArrayData, ArrowError> {
1134 self.validate_num_child_data(1)?;
1135 self.get_valid_child_data(0, expected_type)
1136 }
1137
1138 fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1140 if self.child_data.len() != expected_len {
1141 Err(ArrowError::InvalidArgumentError(format!(
1142 "Value data for {} should contain {} child data array(s), had {}",
1143 self.data_type,
1144 expected_len,
1145 self.child_data.len()
1146 )))
1147 } else {
1148 Ok(())
1149 }
1150 }
1151
1152 fn get_valid_child_data(
1155 &self,
1156 i: usize,
1157 expected_type: &DataType,
1158 ) -> Result<&ArrayData, ArrowError> {
1159 let values_data = self.child_data.get(i).ok_or_else(|| {
1160 ArrowError::InvalidArgumentError(format!(
1161 "{} did not have enough child arrays. Expected at least {} but had only {}",
1162 self.data_type,
1163 i + 1,
1164 self.child_data.len()
1165 ))
1166 })?;
1167
1168 if expected_type != &values_data.data_type {
1169 return Err(ArrowError::InvalidArgumentError(format!(
1170 "Child type mismatch for {}. Expected {} but child data had {}",
1171 self.data_type, expected_type, values_data.data_type
1172 )));
1173 }
1174
1175 values_data.validate()?;
1176 Ok(values_data)
1177 }
1178
1179 pub fn validate_data(&self) -> Result<(), ArrowError> {
1195 self.validate()?;
1196
1197 self.validate_nulls()?;
1198 self.validate_values()?;
1199 Ok(())
1200 }
1201
1202 pub fn validate_full(&self) -> Result<(), ArrowError> {
1207 self.validate_data()?;
1208 self.child_data
1210 .iter()
1211 .enumerate()
1212 .try_for_each(|(i, child_data)| {
1213 child_data.validate_full().map_err(|e| {
1214 ArrowError::InvalidArgumentError(format!(
1215 "{} child #{} invalid: {}",
1216 self.data_type, i, e
1217 ))
1218 })
1219 })?;
1220 Ok(())
1221 }
1222
1223 pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1233 if let Some(nulls) = &self.nulls {
1234 let actual = nulls.len() - nulls.inner().count_set_bits();
1235 if actual != nulls.null_count() {
1236 return Err(ArrowError::InvalidArgumentError(format!(
1237 "null_count value ({}) doesn't match actual number of nulls in array ({})",
1238 nulls.null_count(),
1239 actual
1240 )));
1241 }
1242 }
1243
1244 match &self.data_type {
1249 DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1250 if !f.is_nullable() {
1251 self.validate_non_nullable(None, &self.child_data[0])?
1252 }
1253 }
1254 DataType::FixedSizeList(field, len) => {
1255 let child = &self.child_data[0];
1256 if !field.is_nullable() {
1257 match &self.nulls {
1258 Some(nulls) => {
1259 let element_len = *len as usize;
1260 let expanded = nulls.expand(element_len);
1261 self.validate_non_nullable(Some(&expanded), child)?;
1262 }
1263 None => self.validate_non_nullable(None, child)?,
1264 }
1265 }
1266 }
1267 DataType::Struct(fields) => {
1268 for (field, child) in fields.iter().zip(&self.child_data) {
1269 if !field.is_nullable() {
1270 self.validate_non_nullable(self.nulls(), child)?
1271 }
1272 }
1273 }
1274 _ => {}
1275 }
1276
1277 Ok(())
1278 }
1279
1280 fn validate_non_nullable(
1282 &self,
1283 mask: Option<&NullBuffer>,
1284 child: &ArrayData,
1285 ) -> Result<(), ArrowError> {
1286 let mask = match mask {
1287 Some(mask) => mask,
1288 None => {
1289 return match child.null_count() {
1290 0 => Ok(()),
1291 _ => Err(ArrowError::InvalidArgumentError(format!(
1292 "non-nullable child of type {} contains nulls not present in parent {}",
1293 child.data_type, self.data_type
1294 ))),
1295 };
1296 }
1297 };
1298
1299 match child.nulls() {
1300 Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1301 "non-nullable child of type {} contains nulls not present in parent",
1302 child.data_type
1303 ))),
1304 _ => Ok(()),
1305 }
1306 }
1307
1308 pub fn validate_values(&self) -> Result<(), ArrowError> {
1314 match &self.data_type {
1315 DataType::Utf8 => self.validate_utf8::<i32>(),
1316 DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1317 DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1318 DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1319 DataType::BinaryView => {
1320 let views = self.typed_buffer::<u128>(0, self.len)?;
1321 validate_binary_view(views, &self.buffers[1..])
1322 }
1323 DataType::Utf8View => {
1324 let views = self.typed_buffer::<u128>(0, self.len)?;
1325 validate_string_view(views, &self.buffers[1..])
1326 }
1327 DataType::List(_) | DataType::Map(_, _) => {
1328 let child = &self.child_data[0];
1329 self.validate_offsets_full::<i32>(child.len)
1330 }
1331 DataType::LargeList(_) => {
1332 let child = &self.child_data[0];
1333 self.validate_offsets_full::<i64>(child.len)
1334 }
1335 DataType::Union(_, _) => {
1336 Ok(())
1342 }
1343 DataType::Dictionary(key_type, _value_type) => {
1344 let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1345 let max_value = dictionary_length - 1;
1346 match key_type.as_ref() {
1347 DataType::UInt8 => self.check_bounds::<u8>(max_value),
1348 DataType::UInt16 => self.check_bounds::<u16>(max_value),
1349 DataType::UInt32 => self.check_bounds::<u32>(max_value),
1350 DataType::UInt64 => self.check_bounds::<u64>(max_value),
1351 DataType::Int8 => self.check_bounds::<i8>(max_value),
1352 DataType::Int16 => self.check_bounds::<i16>(max_value),
1353 DataType::Int32 => self.check_bounds::<i32>(max_value),
1354 DataType::Int64 => self.check_bounds::<i64>(max_value),
1355 _ => unreachable!(),
1356 }
1357 }
1358 DataType::RunEndEncoded(run_ends, _values) => {
1359 let run_ends_data = self.child_data()[0].clone();
1360 match run_ends.data_type() {
1361 DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1362 DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1363 DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1364 _ => unreachable!(),
1365 }
1366 }
1367 _ => {
1368 Ok(())
1370 }
1371 }
1372 }
1373
1374 fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1385 where
1386 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1387 V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1388 {
1389 self.typed_offsets::<T>()?
1390 .iter()
1391 .enumerate()
1392 .map(|(i, x)| {
1393 let r = x.to_usize().ok_or_else(|| {
1395 ArrowError::InvalidArgumentError(format!(
1396 "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1397 );
1398 match r {
1400 Ok(n) if n <= offset_limit => Ok((i, n)),
1401 Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1402 "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1403 ),
1404 Err(e) => Err(e),
1405 }
1406 })
1407 .scan(0_usize, |start, end| {
1408 match end {
1410 Ok((i, end)) if *start <= end => {
1411 let range = Some(Ok((i, *start..end)));
1412 *start = end;
1413 range
1414 }
1415 Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1416 "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1417 i - 1, start, end))
1418 )),
1419 Err(err) => Some(Err(err)),
1420 }
1421 })
1422 .skip(1) .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1424 let (item_index, range) = res?;
1425 validate(item_index-1, range)
1426 })
1427 }
1428
1429 fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1432 where
1433 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1434 {
1435 let values_buffer = &self.buffers[1].as_slice();
1436 if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1437 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1439 if !values_str.is_char_boundary(range.start)
1440 || !values_str.is_char_boundary(range.end)
1441 {
1442 return Err(ArrowError::InvalidArgumentError(format!(
1443 "incomplete utf-8 byte sequence from index {string_index}"
1444 )));
1445 }
1446 Ok(())
1447 })
1448 } else {
1449 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1451 std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1452 ArrowError::InvalidArgumentError(format!(
1453 "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1454 ))
1455 })?;
1456 Ok(())
1457 })
1458 }
1459 }
1460
1461 fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1464 where
1465 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1466 {
1467 self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1468 Ok(())
1471 })
1472 }
1473
1474 fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1477 where
1478 T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1479 {
1480 let required_len = self.len + self.offset;
1481 let buffer = &self.buffers[0];
1482
1483 assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1486
1487 let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1489
1490 indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1491 if self.is_null(i) {
1493 return Ok(());
1494 }
1495 let dict_index: i64 = dict_index.try_into().map_err(|_| {
1496 ArrowError::InvalidArgumentError(format!(
1497 "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1498 ))
1499 })?;
1500
1501 if dict_index < 0 || dict_index > max_value {
1502 return Err(ArrowError::InvalidArgumentError(format!(
1503 "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1504 )));
1505 }
1506 Ok(())
1507 })
1508 }
1509
1510 fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1512 where
1513 T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1514 {
1515 let values = self.typed_buffer::<T>(0, self.len)?;
1516 let mut prev_value: i64 = 0_i64;
1517 values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1518 let value: i64 = inp_value.try_into().map_err(|_| {
1519 ArrowError::InvalidArgumentError(format!(
1520 "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1521 ))
1522 })?;
1523 if value <= 0_i64 {
1524 return Err(ArrowError::InvalidArgumentError(format!(
1525 "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1526 )));
1527 }
1528 if ix > 0 && value <= prev_value {
1529 return Err(ArrowError::InvalidArgumentError(format!(
1530 "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1531 )));
1532 }
1533
1534 prev_value = value;
1535 Ok(())
1536 })?;
1537
1538 if prev_value.as_usize() < (self.offset + self.len) {
1539 return Err(ArrowError::InvalidArgumentError(format!(
1540 "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1541 self.offset + self.len
1542 )));
1543 }
1544 Ok(())
1545 }
1546
1547 pub fn ptr_eq(&self, other: &Self) -> bool {
1551 if self.offset != other.offset
1552 || self.len != other.len
1553 || self.data_type != other.data_type
1554 || self.buffers.len() != other.buffers.len()
1555 || self.child_data.len() != other.child_data.len()
1556 {
1557 return false;
1558 }
1559
1560 match (&self.nulls, &other.nulls) {
1561 (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1562 (Some(_), None) | (None, Some(_)) => return false,
1563 _ => {}
1564 };
1565
1566 if !self
1567 .buffers
1568 .iter()
1569 .zip(other.buffers.iter())
1570 .all(|(a, b)| a.as_ptr() == b.as_ptr())
1571 {
1572 return false;
1573 }
1574
1575 self.child_data
1576 .iter()
1577 .zip(other.child_data.iter())
1578 .all(|(a, b)| a.ptr_eq(b))
1579 }
1580
1581 pub fn into_builder(self) -> ArrayDataBuilder {
1583 self.into()
1584 }
1585}
1586
1587pub fn layout(data_type: &DataType) -> DataTypeLayout {
1590 use arrow_schema::IntervalUnit::*;
1593
1594 match data_type {
1595 DataType::Null => DataTypeLayout {
1596 buffers: vec![],
1597 can_contain_null_mask: false,
1598 variadic: false,
1599 },
1600 DataType::Boolean => DataTypeLayout {
1601 buffers: vec![BufferSpec::BitMap],
1602 can_contain_null_mask: true,
1603 variadic: false,
1604 },
1605 DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1606 DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1607 DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1608 DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1609 DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1610 DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1611 DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1612 DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1613 DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1614 DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1615 DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1616 DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1617 DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1618 DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1619 DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1620 DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1621 DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1622 DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1623 DataType::Interval(MonthDayNano) => {
1624 DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1625 }
1626 DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1627 DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1628 DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1629 DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1630 DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1631 DataType::FixedSizeBinary(size) => {
1632 let spec = BufferSpec::FixedWidth {
1633 byte_width: (*size).try_into().unwrap(),
1634 alignment: mem::align_of::<u8>(),
1635 };
1636 DataTypeLayout {
1637 buffers: vec![spec],
1638 can_contain_null_mask: true,
1639 variadic: false,
1640 }
1641 }
1642 DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1643 DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1644 DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1645 DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1646 DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1647 DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1649 DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1650 DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1651 DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1652 DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1653 DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), DataType::Union(_, mode) => {
1656 let type_ids = BufferSpec::FixedWidth {
1657 byte_width: mem::size_of::<i8>(),
1658 alignment: mem::align_of::<i8>(),
1659 };
1660
1661 DataTypeLayout {
1662 buffers: match mode {
1663 UnionMode::Sparse => {
1664 vec![type_ids]
1665 }
1666 UnionMode::Dense => {
1667 vec![
1668 type_ids,
1669 BufferSpec::FixedWidth {
1670 byte_width: mem::size_of::<i32>(),
1671 alignment: mem::align_of::<i32>(),
1672 },
1673 ]
1674 }
1675 },
1676 can_contain_null_mask: false,
1677 variadic: false,
1678 }
1679 }
1680 DataType::Dictionary(key_type, _value_type) => layout(key_type),
1681 }
1682}
1683
1684#[derive(Debug, PartialEq, Eq)]
1686pub struct DataTypeLayout {
1688 pub buffers: Vec<BufferSpec>,
1690
1691 pub can_contain_null_mask: bool,
1693
1694 pub variadic: bool,
1698}
1699
1700impl DataTypeLayout {
1701 pub fn new_fixed_width<T>() -> Self {
1703 Self {
1704 buffers: vec![BufferSpec::FixedWidth {
1705 byte_width: mem::size_of::<T>(),
1706 alignment: mem::align_of::<T>(),
1707 }],
1708 can_contain_null_mask: true,
1709 variadic: false,
1710 }
1711 }
1712
1713 pub fn new_nullable_empty() -> Self {
1716 Self {
1717 buffers: vec![],
1718 can_contain_null_mask: true,
1719 variadic: false,
1720 }
1721 }
1722
1723 pub fn new_empty() -> Self {
1726 Self {
1727 buffers: vec![],
1728 can_contain_null_mask: false,
1729 variadic: false,
1730 }
1731 }
1732
1733 pub fn new_binary<T>() -> Self {
1737 Self {
1738 buffers: vec![
1739 BufferSpec::FixedWidth {
1741 byte_width: mem::size_of::<T>(),
1742 alignment: mem::align_of::<T>(),
1743 },
1744 BufferSpec::VariableWidth,
1746 ],
1747 can_contain_null_mask: true,
1748 variadic: false,
1749 }
1750 }
1751
1752 pub fn new_view() -> Self {
1754 Self {
1755 buffers: vec![BufferSpec::FixedWidth {
1756 byte_width: mem::size_of::<u128>(),
1757 alignment: mem::align_of::<u128>(),
1758 }],
1759 can_contain_null_mask: true,
1760 variadic: true,
1761 }
1762 }
1763
1764 pub fn new_list_view<T>() -> Self {
1766 Self {
1767 buffers: vec![
1768 BufferSpec::FixedWidth {
1769 byte_width: mem::size_of::<T>(),
1770 alignment: mem::align_of::<T>(),
1771 },
1772 BufferSpec::FixedWidth {
1773 byte_width: mem::size_of::<T>(),
1774 alignment: mem::align_of::<T>(),
1775 },
1776 ],
1777 can_contain_null_mask: true,
1778 variadic: true,
1779 }
1780 }
1781}
1782
1783#[derive(Debug, PartialEq, Eq)]
1785pub enum BufferSpec {
1786 FixedWidth {
1797 byte_width: usize,
1799 alignment: usize,
1801 },
1802 VariableWidth,
1804 BitMap,
1810 #[allow(dead_code)]
1813 AlwaysNull,
1814}
1815
1816impl PartialEq for ArrayData {
1817 fn eq(&self, other: &Self) -> bool {
1818 equal::equal(self, other)
1819 }
1820}
1821
1822#[derive(Debug, Clone)]
1841#[doc(hidden)]
1842pub struct UnsafeFlag(bool);
1843
1844impl UnsafeFlag {
1845 #[inline]
1849 pub const fn new() -> Self {
1850 Self(false)
1851 }
1852
1853 #[inline]
1863 pub unsafe fn set(&mut self, val: bool) {
1864 self.0 = val;
1865 }
1866
1867 #[inline]
1869 pub fn get(&self) -> bool {
1870 self.0
1871 }
1872}
1873
1874impl Default for UnsafeFlag {
1876 fn default() -> Self {
1877 Self::new()
1878 }
1879}
1880
1881#[derive(Debug)]
1883pub struct ArrayDataBuilder {
1884 data_type: DataType,
1885 len: usize,
1886 null_count: Option<usize>,
1887 null_bit_buffer: Option<Buffer>,
1888 nulls: Option<NullBuffer>,
1889 offset: usize,
1890 buffers: Vec<Buffer>,
1891 child_data: Vec<ArrayData>,
1892 align_buffers: bool,
1896 skip_validation: UnsafeFlag,
1906}
1907
1908impl ArrayDataBuilder {
1909 #[inline]
1910 pub const fn new(data_type: DataType) -> Self {
1912 Self {
1913 data_type,
1914 len: 0,
1915 null_count: None,
1916 null_bit_buffer: None,
1917 nulls: None,
1918 offset: 0,
1919 buffers: vec![],
1920 child_data: vec![],
1921 align_buffers: false,
1922 skip_validation: UnsafeFlag::new(),
1923 }
1924 }
1925
1926 pub fn data_type(self, data_type: DataType) -> Self {
1928 Self { data_type, ..self }
1929 }
1930
1931 #[inline]
1932 #[allow(clippy::len_without_is_empty)]
1933 pub const fn len(mut self, n: usize) -> Self {
1935 self.len = n;
1936 self
1937 }
1938
1939 pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1941 self.nulls = nulls;
1942 self.null_count = None;
1943 self.null_bit_buffer = None;
1944 self
1945 }
1946
1947 pub fn null_count(mut self, null_count: usize) -> Self {
1949 self.null_count = Some(null_count);
1950 self
1951 }
1952
1953 pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1955 self.nulls = None;
1956 self.null_bit_buffer = buf;
1957 self
1958 }
1959
1960 #[inline]
1962 pub const fn offset(mut self, n: usize) -> Self {
1963 self.offset = n;
1964 self
1965 }
1966
1967 pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1969 self.buffers = v;
1970 self
1971 }
1972
1973 pub fn add_buffer(mut self, b: Buffer) -> Self {
1975 self.buffers.push(b);
1976 self
1977 }
1978
1979 pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1981 self.buffers.extend(bs);
1982 self
1983 }
1984
1985 pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1987 self.child_data = v;
1988 self
1989 }
1990
1991 pub fn add_child_data(mut self, r: ArrayData) -> Self {
1993 self.child_data.push(r);
1994 self
1995 }
1996
1997 pub unsafe fn build_unchecked(self) -> ArrayData {
2013 unsafe { self.skip_validation(true) }.build().unwrap()
2014 }
2015
2016 pub fn build(self) -> Result<ArrayData, ArrowError> {
2025 let Self {
2026 data_type,
2027 len,
2028 null_count,
2029 null_bit_buffer,
2030 nulls,
2031 offset,
2032 buffers,
2033 child_data,
2034 align_buffers,
2035 skip_validation,
2036 } = self;
2037
2038 let nulls = nulls
2039 .or_else(|| {
2040 let buffer = null_bit_buffer?;
2041 let buffer = BooleanBuffer::new(buffer, offset, len);
2042 Some(match null_count {
2043 Some(n) => {
2044 unsafe { NullBuffer::new_unchecked(buffer, n) }
2046 }
2047 None => NullBuffer::new(buffer),
2048 })
2049 })
2050 .filter(|b| b.null_count() != 0);
2051
2052 let mut data = ArrayData {
2053 data_type,
2054 len,
2055 offset,
2056 buffers,
2057 child_data,
2058 nulls,
2059 };
2060
2061 if align_buffers {
2062 data.align_buffers();
2063 }
2064
2065 if !skip_validation.get() || cfg!(feature = "force_validate") {
2067 data.validate_data()?;
2068 }
2069 Ok(data)
2070 }
2071
2072 #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2074 pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2075 self.align_buffers(true).build()
2076 }
2077
2078 pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2094 self.align_buffers = align_buffers;
2095 self
2096 }
2097
2098 pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2112 unsafe {
2113 self.skip_validation.set(skip_validation);
2114 }
2115 self
2116 }
2117}
2118
2119impl From<ArrayData> for ArrayDataBuilder {
2120 fn from(d: ArrayData) -> Self {
2121 Self {
2122 data_type: d.data_type,
2123 len: d.len,
2124 offset: d.offset,
2125 buffers: d.buffers,
2126 child_data: d.child_data,
2127 nulls: d.nulls,
2128 null_bit_buffer: None,
2129 null_count: None,
2130 align_buffers: false,
2131 skip_validation: UnsafeFlag::new(),
2132 }
2133 }
2134}
2135
2136#[cfg(test)]
2137mod tests {
2138 use super::*;
2139 use arrow_schema::{Field, Fields};
2140
2141 fn make_i32_buffer(n: usize) -> Buffer {
2145 Buffer::from_slice_ref(vec![42i32; n])
2146 }
2147
2148 fn make_f32_buffer(n: usize) -> Buffer {
2150 Buffer::from_slice_ref(vec![42f32; n])
2151 }
2152
2153 #[test]
2154 fn test_builder() {
2155 let v = (0..25).collect::<Vec<i32>>();
2157 let b1 = Buffer::from_slice_ref(&v);
2158 let arr_data = ArrayData::builder(DataType::Int32)
2159 .len(20)
2160 .offset(5)
2161 .add_buffer(b1)
2162 .null_bit_buffer(Some(Buffer::from([
2163 0b01011111, 0b10110101, 0b01100011, 0b00011110,
2164 ])))
2165 .build()
2166 .unwrap();
2167
2168 assert_eq!(20, arr_data.len());
2169 assert_eq!(10, arr_data.null_count());
2170 assert_eq!(5, arr_data.offset());
2171 assert_eq!(1, arr_data.buffers().len());
2172 assert_eq!(
2173 Buffer::from_slice_ref(&v).as_slice(),
2174 arr_data.buffers()[0].as_slice()
2175 );
2176 }
2177
2178 #[test]
2179 fn test_builder_with_child_data() {
2180 let child_arr_data = ArrayData::try_new(
2181 DataType::Int32,
2182 5,
2183 None,
2184 0,
2185 vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2186 vec![],
2187 )
2188 .unwrap();
2189
2190 let field = Arc::new(Field::new("x", DataType::Int32, true));
2191 let data_type = DataType::Struct(vec![field].into());
2192
2193 let arr_data = ArrayData::builder(data_type)
2194 .len(5)
2195 .offset(0)
2196 .add_child_data(child_arr_data.clone())
2197 .build()
2198 .unwrap();
2199
2200 assert_eq!(5, arr_data.len());
2201 assert_eq!(1, arr_data.child_data().len());
2202 assert_eq!(child_arr_data, arr_data.child_data()[0]);
2203 }
2204
2205 #[test]
2206 fn test_null_count() {
2207 let mut bit_v: [u8; 2] = [0; 2];
2208 bit_util::set_bit(&mut bit_v, 0);
2209 bit_util::set_bit(&mut bit_v, 3);
2210 bit_util::set_bit(&mut bit_v, 10);
2211 let arr_data = ArrayData::builder(DataType::Int32)
2212 .len(16)
2213 .add_buffer(make_i32_buffer(16))
2214 .null_bit_buffer(Some(Buffer::from(bit_v)))
2215 .build()
2216 .unwrap();
2217 assert_eq!(13, arr_data.null_count());
2218
2219 let mut bit_v: [u8; 2] = [0; 2];
2221 bit_util::set_bit(&mut bit_v, 0);
2222 bit_util::set_bit(&mut bit_v, 3);
2223 bit_util::set_bit(&mut bit_v, 10);
2224 let arr_data = ArrayData::builder(DataType::Int32)
2225 .len(12)
2226 .offset(2)
2227 .add_buffer(make_i32_buffer(14)) .null_bit_buffer(Some(Buffer::from(bit_v)))
2229 .build()
2230 .unwrap();
2231 assert_eq!(10, arr_data.null_count());
2232 }
2233
2234 #[test]
2235 fn test_null_buffer_ref() {
2236 let mut bit_v: [u8; 2] = [0; 2];
2237 bit_util::set_bit(&mut bit_v, 0);
2238 bit_util::set_bit(&mut bit_v, 3);
2239 bit_util::set_bit(&mut bit_v, 10);
2240 let arr_data = ArrayData::builder(DataType::Int32)
2241 .len(16)
2242 .add_buffer(make_i32_buffer(16))
2243 .null_bit_buffer(Some(Buffer::from(bit_v)))
2244 .build()
2245 .unwrap();
2246 assert!(arr_data.nulls().is_some());
2247 assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2248 }
2249
2250 #[test]
2251 fn test_slice() {
2252 let mut bit_v: [u8; 2] = [0; 2];
2253 bit_util::set_bit(&mut bit_v, 0);
2254 bit_util::set_bit(&mut bit_v, 3);
2255 bit_util::set_bit(&mut bit_v, 10);
2256 let data = ArrayData::builder(DataType::Int32)
2257 .len(16)
2258 .add_buffer(make_i32_buffer(16))
2259 .null_bit_buffer(Some(Buffer::from(bit_v)))
2260 .build()
2261 .unwrap();
2262 let new_data = data.slice(1, 15);
2263 assert_eq!(data.len() - 1, new_data.len());
2264 assert_eq!(1, new_data.offset());
2265 assert_eq!(data.null_count(), new_data.null_count());
2266
2267 let new_data = new_data.slice(1, 14);
2269 assert_eq!(data.len() - 2, new_data.len());
2270 assert_eq!(2, new_data.offset());
2271 assert_eq!(data.null_count() - 1, new_data.null_count());
2272 }
2273
2274 #[test]
2275 fn test_equality() {
2276 let int_data = ArrayData::builder(DataType::Int32)
2277 .len(1)
2278 .add_buffer(make_i32_buffer(1))
2279 .build()
2280 .unwrap();
2281
2282 let float_data = ArrayData::builder(DataType::Float32)
2283 .len(1)
2284 .add_buffer(make_f32_buffer(1))
2285 .build()
2286 .unwrap();
2287 assert_ne!(int_data, float_data);
2288 assert!(!int_data.ptr_eq(&float_data));
2289 assert!(int_data.ptr_eq(&int_data));
2290
2291 #[allow(clippy::redundant_clone)]
2292 let int_data_clone = int_data.clone();
2293 assert_eq!(int_data, int_data_clone);
2294 assert!(int_data.ptr_eq(&int_data_clone));
2295 assert!(int_data_clone.ptr_eq(&int_data));
2296
2297 let int_data_slice = int_data_clone.slice(1, 0);
2298 assert!(int_data_slice.ptr_eq(&int_data_slice));
2299 assert!(!int_data.ptr_eq(&int_data_slice));
2300 assert!(!int_data_slice.ptr_eq(&int_data));
2301
2302 let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2303 let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2304 let string_data = ArrayData::try_new(
2305 DataType::Utf8,
2306 3,
2307 Some(Buffer::from_iter(vec![true, false, true])),
2308 0,
2309 vec![offsets_buffer, data_buffer],
2310 vec![],
2311 )
2312 .unwrap();
2313
2314 assert_ne!(float_data, string_data);
2315 assert!(!float_data.ptr_eq(&string_data));
2316
2317 assert!(string_data.ptr_eq(&string_data));
2318
2319 #[allow(clippy::redundant_clone)]
2320 let string_data_cloned = string_data.clone();
2321 assert!(string_data_cloned.ptr_eq(&string_data));
2322 assert!(string_data.ptr_eq(&string_data_cloned));
2323
2324 let string_data_slice = string_data.slice(1, 2);
2325 assert!(string_data_slice.ptr_eq(&string_data_slice));
2326 assert!(!string_data_slice.ptr_eq(&string_data))
2327 }
2328
2329 #[test]
2330 fn test_slice_memory_size() {
2331 let mut bit_v: [u8; 2] = [0; 2];
2332 bit_util::set_bit(&mut bit_v, 0);
2333 bit_util::set_bit(&mut bit_v, 3);
2334 bit_util::set_bit(&mut bit_v, 10);
2335 let data = ArrayData::builder(DataType::Int32)
2336 .len(16)
2337 .add_buffer(make_i32_buffer(16))
2338 .null_bit_buffer(Some(Buffer::from(bit_v)))
2339 .build()
2340 .unwrap();
2341 let new_data = data.slice(1, 14);
2342 assert_eq!(
2343 data.get_slice_memory_size().unwrap() - 8,
2344 new_data.get_slice_memory_size().unwrap()
2345 );
2346 let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2347 let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2348 let string_data = ArrayData::try_new(
2349 DataType::Utf8,
2350 3,
2351 Some(Buffer::from_iter(vec![true, false, true])),
2352 0,
2353 vec![offsets_buffer, data_buffer],
2354 vec![],
2355 )
2356 .unwrap();
2357 let string_data_slice = string_data.slice(1, 2);
2358 assert_eq!(
2360 string_data.get_slice_memory_size().unwrap() - 6,
2361 string_data_slice.get_slice_memory_size().unwrap()
2362 );
2363 }
2364
2365 #[test]
2366 fn test_count_nulls() {
2367 let buffer = Buffer::from([0b00010110, 0b10011111]);
2368 let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2369 let count = count_nulls(Some(&buffer), 0, 16);
2370 assert_eq!(count, 7);
2371
2372 let count = count_nulls(Some(&buffer), 4, 8);
2373 assert_eq!(count, 3);
2374 }
2375
2376 #[test]
2377 fn test_contains_nulls() {
2378 let buffer: Buffer =
2379 MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2380 let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2381 assert!(contains_nulls(Some(&buffer), 0, 6));
2382 assert!(contains_nulls(Some(&buffer), 0, 3));
2383 assert!(!contains_nulls(Some(&buffer), 3, 2));
2384 assert!(!contains_nulls(Some(&buffer), 0, 0));
2385 }
2386
2387 #[test]
2388 fn test_alignment() {
2389 let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2390 let sliced = buffer.slice(1);
2391
2392 let mut data = ArrayData {
2393 data_type: DataType::Int32,
2394 len: 0,
2395 offset: 0,
2396 buffers: vec![buffer],
2397 child_data: vec![],
2398 nulls: None,
2399 };
2400 data.validate_full().unwrap();
2401
2402 data.buffers[0] = sliced;
2404 let err = data.validate().unwrap_err();
2405
2406 assert_eq!(
2407 err.to_string(),
2408 "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2409 );
2410
2411 data.align_buffers();
2412 data.validate_full().unwrap();
2413 }
2414
2415 #[test]
2416 fn test_alignment_struct() {
2417 let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2418 let sliced = buffer.slice(1);
2419
2420 let child_data = ArrayData {
2421 data_type: DataType::Int32,
2422 len: 0,
2423 offset: 0,
2424 buffers: vec![buffer],
2425 child_data: vec![],
2426 nulls: None,
2427 };
2428
2429 let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2430 let mut data = ArrayData {
2431 data_type: schema,
2432 len: 0,
2433 offset: 0,
2434 buffers: vec![],
2435 child_data: vec![child_data],
2436 nulls: None,
2437 };
2438 data.validate_full().unwrap();
2439
2440 data.child_data[0].buffers[0] = sliced;
2442 let err = data.validate().unwrap_err();
2443
2444 assert_eq!(
2445 err.to_string(),
2446 "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2447 );
2448
2449 data.align_buffers();
2450 data.validate_full().unwrap();
2451 }
2452
2453 #[test]
2454 fn test_null_view_types() {
2455 let array_len = 32;
2456 let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2457 assert_eq!(array.len(), array_len);
2458 for i in 0..array.len() {
2459 assert!(array.is_null(i));
2460 }
2461
2462 let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2463 assert_eq!(array.len(), array_len);
2464 for i in 0..array.len() {
2465 assert!(array.is_null(i));
2466 }
2467 }
2468}