1use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24 ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35 null_bit_buffer: Option<&NullBuffer>,
36 offset: usize,
37 len: usize,
38) -> bool {
39 match null_bit_buffer {
40 Some(buffer) => {
41 match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42 Some((start, end)) => start != 0 || end != len,
43 None => len != 0, }
45 }
46 None => false, }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52 null_bit_buffer: Option<&NullBuffer>,
53 offset: usize,
54 len: usize,
55) -> usize {
56 if let Some(buf) = null_bit_buffer {
57 let buffer = buf.buffer();
58 len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59 } else {
60 0
61 }
62}
63
64#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67 let empty_buffer = MutableBuffer::new(0);
68 match data_type {
69 DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70 DataType::Boolean => {
71 let bytes = bit_util::ceil(capacity, 8);
72 let buffer = MutableBuffer::new(bytes);
73 [buffer, empty_buffer]
74 }
75 DataType::UInt8
76 | DataType::UInt16
77 | DataType::UInt32
78 | DataType::UInt64
79 | DataType::Int8
80 | DataType::Int16
81 | DataType::Int32
82 | DataType::Int64
83 | DataType::Float16
84 | DataType::Float32
85 | DataType::Float64
86 | DataType::Decimal32(_, _)
87 | DataType::Decimal64(_, _)
88 | DataType::Decimal128(_, _)
89 | DataType::Decimal256(_, _)
90 | DataType::Date32
91 | DataType::Time32(_)
92 | DataType::Date64
93 | DataType::Time64(_)
94 | DataType::Duration(_)
95 | DataType::Timestamp(_, _)
96 | DataType::Interval(_) => [
97 MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98 empty_buffer,
99 ],
100 DataType::Utf8 | DataType::Binary => {
101 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102 buffer.push(0i32);
104 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105 }
106 DataType::LargeUtf8 | DataType::LargeBinary => {
107 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108 buffer.push(0i64);
110 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111 }
112 DataType::BinaryView | DataType::Utf8View => [
113 MutableBuffer::new(capacity * mem::size_of::<u128>()),
114 empty_buffer,
115 ],
116 DataType::List(_) | DataType::Map(_, _) => {
117 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119 buffer.push(0i32);
120 [buffer, empty_buffer]
121 }
122 DataType::ListView(_) => [
123 MutableBuffer::new(capacity * mem::size_of::<i32>()),
124 MutableBuffer::new(capacity * mem::size_of::<i32>()),
125 ],
126 DataType::LargeList(_) => {
127 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129 buffer.push(0i64);
130 [buffer, empty_buffer]
131 }
132 DataType::LargeListView(_) => [
133 MutableBuffer::new(capacity * mem::size_of::<i64>()),
134 MutableBuffer::new(capacity * mem::size_of::<i64>()),
135 ],
136 DataType::FixedSizeBinary(size) => {
137 [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138 }
139 DataType::Dictionary(k, _) => [
140 MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141 empty_buffer,
142 ],
143 DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144 [empty_buffer, MutableBuffer::new(0)]
145 }
146 DataType::Union(_, mode) => {
147 let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148 match mode {
149 UnionMode::Sparse => [type_ids, empty_buffer],
150 UnionMode::Dense => {
151 let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152 [type_ids, offsets]
153 }
154 }
155 }
156 }
157}
158
/// A generic, type-erased description of an array: its logical type, length, offset,
/// raw buffers, child arrays and optional validity mask.
#[derive(Debug, Clone)]
pub struct ArrayData {
    /// The logical type of the array; it selects the buffer layout used for validation.
    data_type: DataType,

    /// The number of logical elements in the array.
    len: usize,

    /// Offset, in elements, that is applied when indexing into `buffers`.
    offset: usize,

    /// The data buffers of this array; their number and meaning depend on `data_type`.
    buffers: Vec<Buffer>,

    /// Nested arrays (e.g. list values, struct fields, dictionary values).
    child_data: Vec<ArrayData>,

    /// The validity mask; `None` means the array has no nulls.
    nulls: Option<NullBuffer>,
}
252
/// A reference-counted, shareable handle to an [`ArrayData`].
pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257 pub unsafe fn new_unchecked(
274 data_type: DataType,
275 len: usize,
276 null_count: Option<usize>,
277 null_bit_buffer: Option<Buffer>,
278 offset: usize,
279 buffers: Vec<Buffer>,
280 child_data: Vec<ArrayData>,
281 ) -> Self {
282 let mut skip_validation = UnsafeFlag::new();
283 unsafe { skip_validation.set(true) };
285
286 ArrayDataBuilder {
287 data_type,
288 len,
289 null_count,
290 null_bit_buffer,
291 nulls: None,
292 offset,
293 buffers,
294 child_data,
295 align_buffers: false,
296 skip_validation,
297 }
298 .build()
299 .unwrap()
300 }
301
302 pub fn try_new(
316 data_type: DataType,
317 len: usize,
318 null_bit_buffer: Option<Buffer>,
319 offset: usize,
320 buffers: Vec<Buffer>,
321 child_data: Vec<ArrayData>,
322 ) -> Result<Self, ArrowError> {
323 if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
327 let needed_len = bit_util::ceil(len + offset, 8);
328 if null_bit_buffer.len() < needed_len {
329 return Err(ArrowError::InvalidArgumentError(format!(
330 "null_bit_buffer size too small. got {} needed {}",
331 null_bit_buffer.len(),
332 needed_len
333 )));
334 }
335 }
336 let new_self = unsafe {
338 Self::new_unchecked(
339 data_type,
340 len,
341 None,
342 null_bit_buffer,
343 offset,
344 buffers,
345 child_data,
346 )
347 };
348
349 new_self.validate_data()?;
354 Ok(new_self)
355 }
356
357 pub fn into_parts(
363 self,
364 ) -> (
365 DataType,
366 usize,
367 Option<NullBuffer>,
368 usize,
369 Vec<Buffer>,
370 Vec<ArrayData>,
371 ) {
372 let Self {
373 data_type,
374 len,
375 nulls,
376 offset,
377 buffers,
378 child_data,
379 } = self;
380
381 (data_type, len, nulls, offset, buffers, child_data)
382 }
383
384 #[inline]
386 pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
387 ArrayDataBuilder::new(data_type)
388 }
389
390 #[inline]
392 pub const fn data_type(&self) -> &DataType {
393 &self.data_type
394 }
395
396 pub fn buffers(&self) -> &[Buffer] {
398 &self.buffers
399 }
400
401 pub fn child_data(&self) -> &[ArrayData] {
404 &self.child_data[..]
405 }
406
407 #[inline]
409 pub fn is_null(&self, i: usize) -> bool {
410 match &self.nulls {
411 Some(v) => v.is_null(i),
412 None => false,
413 }
414 }
415
416 #[inline]
420 pub fn nulls(&self) -> Option<&NullBuffer> {
421 self.nulls.as_ref()
422 }
423
424 #[inline]
426 pub fn is_valid(&self, i: usize) -> bool {
427 !self.is_null(i)
428 }
429
430 #[inline]
432 pub const fn len(&self) -> usize {
433 self.len
434 }
435
436 #[inline]
438 pub const fn is_empty(&self) -> bool {
439 self.len == 0
440 }
441
442 #[inline]
444 pub const fn offset(&self) -> usize {
445 self.offset
446 }
447
448 #[inline]
450 pub fn null_count(&self) -> usize {
451 self.nulls
452 .as_ref()
453 .map(|x| x.null_count())
454 .unwrap_or_default()
455 }
456
457 pub fn get_buffer_memory_size(&self) -> usize {
469 let mut size = 0;
470 for buffer in &self.buffers {
471 size += buffer.capacity();
472 }
473 if let Some(bitmap) = &self.nulls {
474 size += bitmap.buffer().capacity()
475 }
476 for child in &self.child_data {
477 size += child.get_buffer_memory_size();
478 }
479 size
480 }
481
482 pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
495 let mut result: usize = 0;
496 let layout = layout(&self.data_type);
497
498 for spec in layout.buffers.iter() {
499 match spec {
500 BufferSpec::FixedWidth { byte_width, .. } => {
501 let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
502 ArrowError::ComputeError(
503 "Integer overflow computing buffer size".to_string(),
504 )
505 })?;
506 result += buffer_size;
507 }
508 BufferSpec::VariableWidth => {
509 let buffer_len = match self.data_type {
510 DataType::Utf8 | DataType::Binary => {
511 let offsets = self.typed_offsets::<i32>()?;
512 (offsets[self.len] - offsets[0]) as usize
513 }
514 DataType::LargeUtf8 | DataType::LargeBinary => {
515 let offsets = self.typed_offsets::<i64>()?;
516 (offsets[self.len] - offsets[0]) as usize
517 }
518 _ => {
519 return Err(ArrowError::NotYetImplemented(format!(
520 "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
521 self.data_type
522 )));
523 }
524 };
525 result += buffer_len;
526 }
527 BufferSpec::BitMap => {
528 let buffer_size = bit_util::ceil(self.len, 8);
529 result += buffer_size;
530 }
531 BufferSpec::AlwaysNull => {
532 }
534 }
535 }
536
537 if self.nulls().is_some() {
538 result += bit_util::ceil(self.len, 8);
539 }
540
541 for child in &self.child_data {
542 result += child.get_slice_memory_size()?;
543 }
544 Ok(result)
545 }
546
547 pub fn get_array_memory_size(&self) -> usize {
556 let mut size = mem::size_of_val(self);
557
558 for buffer in &self.buffers {
560 size += mem::size_of::<Buffer>();
561 size += buffer.capacity();
562 }
563 if let Some(nulls) = &self.nulls {
564 size += nulls.buffer().capacity();
565 }
566 for child in &self.child_data {
567 size += child.get_array_memory_size();
568 }
569
570 size
571 }
572
573 pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
581 assert!((offset + length) <= self.len());
582
583 if let DataType::Struct(_) = self.data_type() {
584 let new_offset = self.offset + offset;
586 ArrayData {
587 data_type: self.data_type().clone(),
588 len: length,
589 offset: new_offset,
590 buffers: self.buffers.clone(),
591 child_data: self
593 .child_data()
594 .iter()
595 .map(|data| data.slice(offset, length))
596 .collect(),
597 nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
598 }
599 } else {
600 let mut new_data = self.clone();
601
602 new_data.len = length;
603 new_data.offset = offset + self.offset;
604 new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
605
606 new_data
607 }
608 }
609
610 pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
617 &self.buffers()[buffer].typed_data()[self.offset..]
618 }
619
620 pub fn new_null(data_type: &DataType, len: usize) -> Self {
622 let bit_len = bit_util::ceil(len, 8);
623 let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
624
625 let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
626 Some(width) => (vec![zeroed(width * len)], vec![], true),
627 None => match data_type {
628 DataType::Null => (vec![], vec![], false),
629 DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
630 DataType::Binary | DataType::Utf8 => {
631 (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
632 }
633 DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
634 DataType::LargeBinary | DataType::LargeUtf8 => {
635 (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
636 }
637 DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
638 DataType::List(f) | DataType::Map(f, _) => (
639 vec![zeroed((len + 1) * 4)],
640 vec![ArrayData::new_empty(f.data_type())],
641 true,
642 ),
643 DataType::LargeList(f) => (
644 vec![zeroed((len + 1) * 8)],
645 vec![ArrayData::new_empty(f.data_type())],
646 true,
647 ),
648 DataType::ListView(f) => (
649 vec![zeroed(len * 4), zeroed(len * 4)],
650 vec![ArrayData::new_empty(f.data_type())],
651 true,
652 ),
653 DataType::LargeListView(f) => (
654 vec![zeroed(len * 8), zeroed(len * 8)],
655 vec![ArrayData::new_empty(f.data_type())],
656 true,
657 ),
658 DataType::FixedSizeList(f, list_len) => (
659 vec![],
660 vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
661 true,
662 ),
663 DataType::Struct(fields) => (
664 vec![],
665 fields
666 .iter()
667 .map(|f| Self::new_null(f.data_type(), len))
668 .collect(),
669 true,
670 ),
671 DataType::Dictionary(k, v) => (
672 vec![zeroed(k.primitive_width().unwrap() * len)],
673 vec![ArrayData::new_empty(v.as_ref())],
674 true,
675 ),
676 DataType::Union(f, mode) => {
677 let (id, _) = f.iter().next().unwrap();
678 let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
679 let buffers = match mode {
680 UnionMode::Sparse => vec![ids],
681 UnionMode::Dense => {
682 let end_offset = i32::from_usize(len).unwrap();
683 vec![ids, Buffer::from_iter(0_i32..end_offset)]
684 }
685 };
686
687 let children = f
688 .iter()
689 .enumerate()
690 .map(|(idx, (_, f))| {
691 if idx == 0 || *mode == UnionMode::Sparse {
692 Self::new_null(f.data_type(), len)
693 } else {
694 Self::new_empty(f.data_type())
695 }
696 })
697 .collect();
698
699 (buffers, children, false)
700 }
701 DataType::RunEndEncoded(r, v) => {
702 let runs = match r.data_type() {
703 DataType::Int16 => {
704 let i = i16::from_usize(len).expect("run overflow");
705 Buffer::from_slice_ref([i])
706 }
707 DataType::Int32 => {
708 let i = i32::from_usize(len).expect("run overflow");
709 Buffer::from_slice_ref([i])
710 }
711 DataType::Int64 => {
712 let i = i64::from_usize(len).expect("run overflow");
713 Buffer::from_slice_ref([i])
714 }
715 dt => unreachable!("Invalid run ends data type {dt}"),
716 };
717
718 let builder = ArrayData::builder(r.data_type().clone())
719 .len(1)
720 .buffers(vec![runs]);
721
722 let runs = unsafe { builder.build_unchecked() };
725 (
726 vec![],
727 vec![runs, ArrayData::new_null(v.data_type(), 1)],
728 false,
729 )
730 }
731 DataType::Int8
733 | DataType::Int16
734 | DataType::Int32
735 | DataType::Int64
736 | DataType::UInt8
737 | DataType::UInt16
738 | DataType::UInt32
739 | DataType::UInt64
740 | DataType::Float16
741 | DataType::Float32
742 | DataType::Float64
743 | DataType::Timestamp(_, _)
744 | DataType::Date32
745 | DataType::Date64
746 | DataType::Time32(_)
747 | DataType::Time64(_)
748 | DataType::Duration(_)
749 | DataType::Interval(_)
750 | DataType::Decimal32(_, _)
751 | DataType::Decimal64(_, _)
752 | DataType::Decimal128(_, _)
753 | DataType::Decimal256(_, _) => unreachable!("{data_type}"),
754 },
755 };
756
757 let mut builder = ArrayDataBuilder::new(data_type.clone())
758 .len(len)
759 .buffers(buffers)
760 .child_data(child_data);
761
762 if has_nulls {
763 builder = builder.nulls(Some(NullBuffer::new_null(len)))
764 }
765
766 unsafe { builder.build_unchecked() }
769 }
770
771 pub fn new_empty(data_type: &DataType) -> Self {
773 Self::new_null(data_type, 0)
774 }
775
776 pub fn align_buffers(&mut self) {
785 let layout = layout(&self.data_type);
786 for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
787 if let BufferSpec::FixedWidth { alignment, .. } = spec {
788 if buffer.as_ptr().align_offset(*alignment) != 0 {
789 *buffer = Buffer::from_slice_ref(buffer.as_ref());
790 }
791 }
792 }
793 for data in self.child_data.iter_mut() {
795 data.align_buffers()
796 }
797 }
798
799 pub fn validate(&self) -> Result<(), ArrowError> {
810 let len_plus_offset = self.len + self.offset;
812
813 let layout = layout(&self.data_type);
815
816 if !layout.can_contain_null_mask && self.nulls.is_some() {
817 return Err(ArrowError::InvalidArgumentError(format!(
818 "Arrays of type {:?} cannot contain a null bitmask",
819 self.data_type,
820 )));
821 }
822
823 if self.buffers.len() < layout.buffers.len()
825 || (!layout.variadic && self.buffers.len() != layout.buffers.len())
826 {
827 return Err(ArrowError::InvalidArgumentError(format!(
828 "Expected {} buffers in array of type {:?}, got {}",
829 layout.buffers.len(),
830 self.data_type,
831 self.buffers.len(),
832 )));
833 }
834
835 for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
836 match spec {
837 BufferSpec::FixedWidth {
838 byte_width,
839 alignment,
840 } => {
841 let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
842
843 if buffer.len() < min_buffer_size {
844 return Err(ArrowError::InvalidArgumentError(format!(
845 "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
846 min_buffer_size,
847 i,
848 self.data_type,
849 buffer.len()
850 )));
851 }
852
853 let align_offset = buffer.as_ptr().align_offset(*alignment);
854 if align_offset != 0 {
855 return Err(ArrowError::InvalidArgumentError(format!(
856 "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
857 self.data_type,
858 align_offset.min(alignment - align_offset)
859 )));
860 }
861 }
862 BufferSpec::VariableWidth => {
863 }
867 BufferSpec::BitMap => {
868 let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
869 if buffer.len() < min_buffer_size {
870 return Err(ArrowError::InvalidArgumentError(format!(
871 "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
872 min_buffer_size,
873 i,
874 self.data_type,
875 buffer.len()
876 )));
877 }
878 }
879 BufferSpec::AlwaysNull => {
880 }
882 }
883 }
884
885 if let Some(nulls) = self.nulls() {
887 if nulls.null_count() > self.len {
888 return Err(ArrowError::InvalidArgumentError(format!(
889 "null_count {} for an array exceeds length of {} elements",
890 nulls.null_count(),
891 self.len
892 )));
893 }
894
895 let actual_len = nulls.validity().len();
896 let needed_len = bit_util::ceil(len_plus_offset, 8);
897 if actual_len < needed_len {
898 return Err(ArrowError::InvalidArgumentError(format!(
899 "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
900 )));
901 }
902
903 if nulls.len() != self.len {
904 return Err(ArrowError::InvalidArgumentError(format!(
905 "null buffer incorrect size. got {} expected {}",
906 nulls.len(),
907 self.len
908 )));
909 }
910 }
911
912 self.validate_child_data()?;
913
914 match &self.data_type {
916 DataType::Utf8 | DataType::Binary => {
917 self.validate_offsets::<i32>(self.buffers[1].len())?;
918 }
919 DataType::LargeUtf8 | DataType::LargeBinary => {
920 self.validate_offsets::<i64>(self.buffers[1].len())?;
921 }
922 DataType::Dictionary(key_type, _value_type) => {
923 if !DataType::is_dictionary_key_type(key_type) {
925 return Err(ArrowError::InvalidArgumentError(format!(
926 "Dictionary key type must be integer, but was {key_type}"
927 )));
928 }
929 }
930 DataType::RunEndEncoded(run_ends_type, _) => {
931 if run_ends_type.is_nullable() {
932 return Err(ArrowError::InvalidArgumentError(
933 "The nullable should be set to false for the field defining run_ends array.".to_string()
934 ));
935 }
936 if !DataType::is_run_ends_type(run_ends_type.data_type()) {
937 return Err(ArrowError::InvalidArgumentError(format!(
938 "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
939 run_ends_type.data_type()
940 )));
941 }
942 }
943 _ => {}
944 };
945
946 Ok(())
947 }
948
949 fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
956 if self.len == 0 && self.buffers[0].is_empty() {
958 return Ok(&[]);
959 }
960
961 self.typed_buffer(0, self.len + 1)
962 }
963
964 fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
966 &self,
967 idx: usize,
968 len: usize,
969 ) -> Result<&[T], ArrowError> {
970 let buffer = &self.buffers[idx];
971
972 let required_len = (len + self.offset) * mem::size_of::<T>();
973
974 if buffer.len() < required_len {
975 return Err(ArrowError::InvalidArgumentError(format!(
976 "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
977 idx,
978 self.data_type,
979 required_len,
980 buffer.len()
981 )));
982 }
983
984 Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
985 }
986
987 fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
990 &self,
991 values_length: usize,
992 ) -> Result<(), ArrowError> {
993 let offsets = self.typed_offsets::<T>()?;
995 if offsets.is_empty() {
996 return Ok(());
997 }
998
999 let first_offset = offsets[0].to_usize().ok_or_else(|| {
1000 ArrowError::InvalidArgumentError(format!(
1001 "Error converting offset[0] ({}) to usize for {}",
1002 offsets[0], self.data_type
1003 ))
1004 })?;
1005
1006 let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
1007 ArrowError::InvalidArgumentError(format!(
1008 "Error converting offset[{}] ({}) to usize for {}",
1009 self.len, offsets[self.len], self.data_type
1010 ))
1011 })?;
1012
1013 if first_offset > values_length {
1014 return Err(ArrowError::InvalidArgumentError(format!(
1015 "First offset {} of {} is larger than values length {}",
1016 first_offset, self.data_type, values_length,
1017 )));
1018 }
1019
1020 if last_offset > values_length {
1021 return Err(ArrowError::InvalidArgumentError(format!(
1022 "Last offset {} of {} is larger than values length {}",
1023 last_offset, self.data_type, values_length,
1024 )));
1025 }
1026
1027 if first_offset > last_offset {
1028 return Err(ArrowError::InvalidArgumentError(format!(
1029 "First offset {} in {} is smaller than last offset {}",
1030 first_offset, self.data_type, last_offset,
1031 )));
1032 }
1033
1034 Ok(())
1035 }
1036
1037 fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
1040 &self,
1041 values_length: usize,
1042 ) -> Result<(), ArrowError> {
1043 let offsets: &[T] = self.typed_buffer(0, self.len)?;
1044 let sizes: &[T] = self.typed_buffer(1, self.len)?;
1045 if offsets.len() != sizes.len() {
1046 return Err(ArrowError::ComputeError(format!(
1047 "ListView offsets len {} does not match sizes len {}",
1048 offsets.len(),
1049 sizes.len()
1050 )));
1051 }
1052
1053 for i in 0..sizes.len() {
1054 let size = sizes[i].to_usize().ok_or_else(|| {
1055 ArrowError::InvalidArgumentError(format!(
1056 "Error converting size[{}] ({}) to usize for {}",
1057 i, sizes[i], self.data_type
1058 ))
1059 })?;
1060 let offset = offsets[i].to_usize().ok_or_else(|| {
1061 ArrowError::InvalidArgumentError(format!(
1062 "Error converting offset[{}] ({}) to usize for {}",
1063 i, offsets[i], self.data_type
1064 ))
1065 })?;
1066 if size
1067 .checked_add(offset)
1068 .expect("Offset and size have exceeded the usize boundary")
1069 > values_length
1070 {
1071 return Err(ArrowError::InvalidArgumentError(format!(
1072 "Size {} at index {} is larger than the remaining values for {}",
1073 size, i, self.data_type
1074 )));
1075 }
1076 }
1077 Ok(())
1078 }
1079
1080 fn validate_child_data(&self) -> Result<(), ArrowError> {
1082 match &self.data_type {
1083 DataType::List(field) | DataType::Map(field, _) => {
1084 let values_data = self.get_single_valid_child_data(field.data_type())?;
1085 self.validate_offsets::<i32>(values_data.len)?;
1086 Ok(())
1087 }
1088 DataType::LargeList(field) => {
1089 let values_data = self.get_single_valid_child_data(field.data_type())?;
1090 self.validate_offsets::<i64>(values_data.len)?;
1091 Ok(())
1092 }
1093 DataType::ListView(field) => {
1094 let values_data = self.get_single_valid_child_data(field.data_type())?;
1095 self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1096 Ok(())
1097 }
1098 DataType::LargeListView(field) => {
1099 let values_data = self.get_single_valid_child_data(field.data_type())?;
1100 self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1101 Ok(())
1102 }
1103 DataType::FixedSizeList(field, list_size) => {
1104 let values_data = self.get_single_valid_child_data(field.data_type())?;
1105
1106 let list_size: usize = (*list_size).try_into().map_err(|_| {
1107 ArrowError::InvalidArgumentError(format!(
1108 "{} has a negative list_size {}",
1109 self.data_type, list_size
1110 ))
1111 })?;
1112
1113 let expected_values_len = self.len
1114 .checked_mul(list_size)
1115 .expect("integer overflow computing expected number of expected values in FixedListSize");
1116
1117 if values_data.len < expected_values_len {
1118 return Err(ArrowError::InvalidArgumentError(format!(
1119 "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1120 values_data.len, self.len, list_size, self.data_type
1121 )));
1122 }
1123
1124 Ok(())
1125 }
1126 DataType::Struct(fields) => {
1127 self.validate_num_child_data(fields.len())?;
1128 for (i, field) in fields.iter().enumerate() {
1129 let field_data = self.get_valid_child_data(i, field.data_type())?;
1130
1131 if field_data.len < self.len {
1133 return Err(ArrowError::InvalidArgumentError(format!(
1134 "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1135 self.data_type,
1136 i,
1137 field.name(),
1138 field_data.len,
1139 self.len
1140 )));
1141 }
1142 }
1143 Ok(())
1144 }
1145 DataType::RunEndEncoded(run_ends_field, values_field) => {
1146 self.validate_num_child_data(2)?;
1147 let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1148 let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1149 if run_ends_data.len != values_data.len {
1150 return Err(ArrowError::InvalidArgumentError(format!(
1151 "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1152 run_ends_data.len, values_data.len
1153 )));
1154 }
1155 if run_ends_data.nulls.is_some() {
1156 return Err(ArrowError::InvalidArgumentError(
1157 "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1158 ));
1159 }
1160 Ok(())
1161 }
1162 DataType::Union(fields, mode) => {
1163 self.validate_num_child_data(fields.len())?;
1164
1165 for (i, (_, field)) in fields.iter().enumerate() {
1166 let field_data = self.get_valid_child_data(i, field.data_type())?;
1167
1168 if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1169 return Err(ArrowError::InvalidArgumentError(format!(
1170 "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1171 i,
1172 field_data.len,
1173 self.len + self.offset
1174 )));
1175 }
1176 }
1177 Ok(())
1178 }
1179 DataType::Dictionary(_key_type, value_type) => {
1180 self.get_single_valid_child_data(value_type)?;
1181 Ok(())
1182 }
1183 _ => {
1184 if !self.child_data.is_empty() {
1186 return Err(ArrowError::InvalidArgumentError(format!(
1187 "Expected no child arrays for type {} but got {}",
1188 self.data_type,
1189 self.child_data.len()
1190 )));
1191 }
1192 Ok(())
1193 }
1194 }
1195 }
1196
1197 fn get_single_valid_child_data(
1201 &self,
1202 expected_type: &DataType,
1203 ) -> Result<&ArrayData, ArrowError> {
1204 self.validate_num_child_data(1)?;
1205 self.get_valid_child_data(0, expected_type)
1206 }
1207
1208 fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1210 if self.child_data.len() != expected_len {
1211 Err(ArrowError::InvalidArgumentError(format!(
1212 "Value data for {} should contain {} child data array(s), had {}",
1213 self.data_type,
1214 expected_len,
1215 self.child_data.len()
1216 )))
1217 } else {
1218 Ok(())
1219 }
1220 }
1221
1222 fn get_valid_child_data(
1225 &self,
1226 i: usize,
1227 expected_type: &DataType,
1228 ) -> Result<&ArrayData, ArrowError> {
1229 let values_data = self.child_data.get(i).ok_or_else(|| {
1230 ArrowError::InvalidArgumentError(format!(
1231 "{} did not have enough child arrays. Expected at least {} but had only {}",
1232 self.data_type,
1233 i + 1,
1234 self.child_data.len()
1235 ))
1236 })?;
1237
1238 if expected_type != &values_data.data_type {
1239 return Err(ArrowError::InvalidArgumentError(format!(
1240 "Child type mismatch for {}. Expected {} but child data had {}",
1241 self.data_type, expected_type, values_data.data_type
1242 )));
1243 }
1244
1245 values_data.validate()?;
1246 Ok(values_data)
1247 }
1248
1249 pub fn validate_data(&self) -> Result<(), ArrowError> {
1265 self.validate()?;
1266
1267 self.validate_nulls()?;
1268 self.validate_values()?;
1269 Ok(())
1270 }
1271
1272 pub fn validate_full(&self) -> Result<(), ArrowError> {
1277 self.validate_data()?;
1278 self.child_data
1280 .iter()
1281 .enumerate()
1282 .try_for_each(|(i, child_data)| {
1283 child_data.validate_full().map_err(|e| {
1284 ArrowError::InvalidArgumentError(format!(
1285 "{} child #{} invalid: {}",
1286 self.data_type, i, e
1287 ))
1288 })
1289 })?;
1290 Ok(())
1291 }
1292
1293 pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1303 if let Some(nulls) = &self.nulls {
1304 let actual = nulls.len() - nulls.inner().count_set_bits();
1305 if actual != nulls.null_count() {
1306 return Err(ArrowError::InvalidArgumentError(format!(
1307 "null_count value ({}) doesn't match actual number of nulls in array ({})",
1308 nulls.null_count(),
1309 actual
1310 )));
1311 }
1312 }
1313
1314 match &self.data_type {
1319 DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1320 if !f.is_nullable() {
1321 self.validate_non_nullable(None, &self.child_data[0])?
1322 }
1323 }
1324 DataType::FixedSizeList(field, len) => {
1325 let child = &self.child_data[0];
1326 if !field.is_nullable() {
1327 match &self.nulls {
1328 Some(nulls) => {
1329 let element_len = *len as usize;
1330 let expanded = nulls.expand(element_len);
1331 self.validate_non_nullable(Some(&expanded), child)?;
1332 }
1333 None => self.validate_non_nullable(None, child)?,
1334 }
1335 }
1336 }
1337 DataType::Struct(fields) => {
1338 for (field, child) in fields.iter().zip(&self.child_data) {
1339 if !field.is_nullable() {
1340 self.validate_non_nullable(self.nulls(), child)?
1341 }
1342 }
1343 }
1344 _ => {}
1345 }
1346
1347 Ok(())
1348 }
1349
1350 fn validate_non_nullable(
1352 &self,
1353 mask: Option<&NullBuffer>,
1354 child: &ArrayData,
1355 ) -> Result<(), ArrowError> {
1356 let mask = match mask {
1357 Some(mask) => mask,
1358 None => {
1359 return match child.null_count() {
1360 0 => Ok(()),
1361 _ => Err(ArrowError::InvalidArgumentError(format!(
1362 "non-nullable child of type {} contains nulls not present in parent {}",
1363 child.data_type, self.data_type
1364 ))),
1365 };
1366 }
1367 };
1368
1369 match child.nulls() {
1370 Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1371 "non-nullable child of type {} contains nulls not present in parent",
1372 child.data_type
1373 ))),
1374 _ => Ok(()),
1375 }
1376 }
1377
1378 pub fn validate_values(&self) -> Result<(), ArrowError> {
1384 match &self.data_type {
1385 DataType::Utf8 => self.validate_utf8::<i32>(),
1386 DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1387 DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1388 DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1389 DataType::BinaryView => {
1390 let views = self.typed_buffer::<u128>(0, self.len)?;
1391 validate_binary_view(views, &self.buffers[1..])
1392 }
1393 DataType::Utf8View => {
1394 let views = self.typed_buffer::<u128>(0, self.len)?;
1395 validate_string_view(views, &self.buffers[1..])
1396 }
1397 DataType::List(_) | DataType::Map(_, _) => {
1398 let child = &self.child_data[0];
1399 self.validate_offsets_full::<i32>(child.len)
1400 }
1401 DataType::LargeList(_) => {
1402 let child = &self.child_data[0];
1403 self.validate_offsets_full::<i64>(child.len)
1404 }
1405 DataType::Union(_, _) => {
1406 Ok(())
1412 }
1413 DataType::Dictionary(key_type, _value_type) => {
1414 let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1415 let max_value = dictionary_length - 1;
1416 match key_type.as_ref() {
1417 DataType::UInt8 => self.check_bounds::<u8>(max_value),
1418 DataType::UInt16 => self.check_bounds::<u16>(max_value),
1419 DataType::UInt32 => self.check_bounds::<u32>(max_value),
1420 DataType::UInt64 => self.check_bounds::<u64>(max_value),
1421 DataType::Int8 => self.check_bounds::<i8>(max_value),
1422 DataType::Int16 => self.check_bounds::<i16>(max_value),
1423 DataType::Int32 => self.check_bounds::<i32>(max_value),
1424 DataType::Int64 => self.check_bounds::<i64>(max_value),
1425 _ => unreachable!(),
1426 }
1427 }
1428 DataType::RunEndEncoded(run_ends, _values) => {
1429 let run_ends_data = self.child_data()[0].clone();
1430 match run_ends.data_type() {
1431 DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1432 DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1433 DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1434 _ => unreachable!(),
1435 }
1436 }
1437 _ => {
1438 Ok(())
1440 }
1441 }
1442 }
1443
    /// Walks the offsets returned by `typed_offsets::<T>()` and calls
    /// `validate(index, range)` for each element, where `range` is
    /// `previous_offset..offset` converted to `usize`.
    ///
    /// For every offset this checks that:
    ///
    /// 1. it can be converted to `usize`,
    /// 2. it does not exceed `offset_limit`,
    /// 3. it is not smaller than the previous offset.
    ///
    /// # Errors
    ///
    /// Returns an `ArrowError::InvalidArgumentError` for the first offset that
    /// violates one of the invariants above, or the first error returned by
    /// `validate`.
    ///
    /// NOTE(review): the result produced for the offset at position 0 is
    /// dropped by `skip(1)` even when it is an `Err`, so a bad first offset is
    /// not reported by this function. Presumably that case is caught by an
    /// earlier, cheaper validation pass; confirm against the callers.
    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
    where
        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
    {
        self.typed_offsets::<T>()?
            .iter()
            .enumerate()
            .map(|(i, x)| {
                // Step 1: the offset must be representable as `usize`
                // (this rejects negative offsets).
                let r = x.to_usize().ok_or_else(|| {
                    ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
                    );
                // Step 2: the offset must not point past `offset_limit`.
                match r {
                    Ok(n) if n <= offset_limit => Ok((i, n)),
                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
                    ),
                    Err(e) => Err(e),
                }
            })
            // Step 3: pair each offset with its predecessor. The scan state
            // `start` holds the last accepted offset (initially 0).
            .scan(0_usize, |start, end| {
                match end {
                    // Offsets must be non-decreasing.
                    Ok((i, end)) if *start <= end => {
                        let range = Some(Ok((i, *start..end)));
                        *start = end;
                        range
                    }
                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
                        i - 1, start, end))
                    )),
                    // Errors from the earlier steps pass through unchanged.
                    Err(err) => Some(Err(err)),
                }
            })
            // The first scanned item only seeds `start`: its range begins at
            // the initial 0 rather than at a real previous offset.
            .skip(1)
            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
                let (item_index, range) = res?;
                // Offset `k` closes element `k - 1`.
                validate(item_index-1, range)
            })
    }
1498
1499 fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1502 where
1503 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1504 {
1505 let values_buffer = &self.buffers[1].as_slice();
1506 if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1507 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1509 if !values_str.is_char_boundary(range.start)
1510 || !values_str.is_char_boundary(range.end)
1511 {
1512 return Err(ArrowError::InvalidArgumentError(format!(
1513 "incomplete utf-8 byte sequence from index {string_index}"
1514 )));
1515 }
1516 Ok(())
1517 })
1518 } else {
1519 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1521 std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1522 ArrowError::InvalidArgumentError(format!(
1523 "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1524 ))
1525 })?;
1526 Ok(())
1527 })
1528 }
1529 }
1530
1531 fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1534 where
1535 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1536 {
1537 self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1538 Ok(())
1541 })
1542 }
1543
1544 fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1547 where
1548 T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1549 {
1550 let required_len = self.len + self.offset;
1551 let buffer = &self.buffers[0];
1552
1553 assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1556
1557 let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1559
1560 indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1561 if self.is_null(i) {
1563 return Ok(());
1564 }
1565 let dict_index: i64 = dict_index.try_into().map_err(|_| {
1566 ArrowError::InvalidArgumentError(format!(
1567 "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1568 ))
1569 })?;
1570
1571 if dict_index < 0 || dict_index > max_value {
1572 return Err(ArrowError::InvalidArgumentError(format!(
1573 "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1574 )));
1575 }
1576 Ok(())
1577 })
1578 }
1579
1580 fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1582 where
1583 T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1584 {
1585 let values = self.typed_buffer::<T>(0, self.len)?;
1586 let mut prev_value: i64 = 0_i64;
1587 values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1588 let value: i64 = inp_value.try_into().map_err(|_| {
1589 ArrowError::InvalidArgumentError(format!(
1590 "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1591 ))
1592 })?;
1593 if value <= 0_i64 {
1594 return Err(ArrowError::InvalidArgumentError(format!(
1595 "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1596 )));
1597 }
1598 if ix > 0 && value <= prev_value {
1599 return Err(ArrowError::InvalidArgumentError(format!(
1600 "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1601 )));
1602 }
1603
1604 prev_value = value;
1605 Ok(())
1606 })?;
1607
1608 if prev_value.as_usize() < (self.offset + self.len) {
1609 return Err(ArrowError::InvalidArgumentError(format!(
1610 "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1611 self.offset + self.len
1612 )));
1613 }
1614 Ok(())
1615 }
1616
1617 pub fn ptr_eq(&self, other: &Self) -> bool {
1621 if self.offset != other.offset
1622 || self.len != other.len
1623 || self.data_type != other.data_type
1624 || self.buffers.len() != other.buffers.len()
1625 || self.child_data.len() != other.child_data.len()
1626 {
1627 return false;
1628 }
1629
1630 match (&self.nulls, &other.nulls) {
1631 (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1632 (Some(_), None) | (None, Some(_)) => return false,
1633 _ => {}
1634 };
1635
1636 if !self
1637 .buffers
1638 .iter()
1639 .zip(other.buffers.iter())
1640 .all(|(a, b)| a.as_ptr() == b.as_ptr())
1641 {
1642 return false;
1643 }
1644
1645 self.child_data
1646 .iter()
1647 .zip(other.child_data.iter())
1648 .all(|(a, b)| a.ptr_eq(b))
1649 }
1650
1651 pub fn into_builder(self) -> ArrayDataBuilder {
1653 self.into()
1654 }
1655}
1656
1657pub fn layout(data_type: &DataType) -> DataTypeLayout {
1660 use arrow_schema::IntervalUnit::*;
1663
1664 match data_type {
1665 DataType::Null => DataTypeLayout {
1666 buffers: vec![],
1667 can_contain_null_mask: false,
1668 variadic: false,
1669 },
1670 DataType::Boolean => DataTypeLayout {
1671 buffers: vec![BufferSpec::BitMap],
1672 can_contain_null_mask: true,
1673 variadic: false,
1674 },
1675 DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1676 DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1677 DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1678 DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1679 DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1680 DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1681 DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1682 DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1683 DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1684 DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1685 DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1686 DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1687 DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1688 DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1689 DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1690 DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1691 DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1692 DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1693 DataType::Interval(MonthDayNano) => {
1694 DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1695 }
1696 DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1697 DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1698 DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1699 DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1700 DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1701 DataType::FixedSizeBinary(size) => {
1702 let spec = BufferSpec::FixedWidth {
1703 byte_width: (*size).try_into().unwrap(),
1704 alignment: mem::align_of::<u8>(),
1705 };
1706 DataTypeLayout {
1707 buffers: vec![spec],
1708 can_contain_null_mask: true,
1709 variadic: false,
1710 }
1711 }
1712 DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1713 DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1714 DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1715 DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1716 DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1717 DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1719 DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1720 DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1721 DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1722 DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1723 DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), DataType::Union(_, mode) => {
1726 let type_ids = BufferSpec::FixedWidth {
1727 byte_width: mem::size_of::<i8>(),
1728 alignment: mem::align_of::<i8>(),
1729 };
1730
1731 DataTypeLayout {
1732 buffers: match mode {
1733 UnionMode::Sparse => {
1734 vec![type_ids]
1735 }
1736 UnionMode::Dense => {
1737 vec![
1738 type_ids,
1739 BufferSpec::FixedWidth {
1740 byte_width: mem::size_of::<i32>(),
1741 alignment: mem::align_of::<i32>(),
1742 },
1743 ]
1744 }
1745 },
1746 can_contain_null_mask: false,
1747 variadic: false,
1748 }
1749 }
1750 DataType::Dictionary(key_type, _value_type) => layout(key_type),
1751 }
1752}
1753
/// Describes the buffers an array of some data type is expected to have.
#[derive(Debug, PartialEq, Eq)]
pub struct DataTypeLayout {
    /// One [`BufferSpec`] per expected buffer, in buffer order.
    pub buffers: Vec<BufferSpec>,

    /// Whether arrays of this type can carry a null mask.
    pub can_contain_null_mask: bool,

    /// Set only by [`DataTypeLayout::new_view`].
    ///
    /// NOTE(review): presumably `true` means further buffers beyond those
    /// listed in `buffers` are permitted; confirm against the consumers.
    pub variadic: bool,
}

impl DataTypeLayout {
    /// Spec for a buffer of values sized and aligned like `T`.
    fn fixed_width_spec<T>() -> BufferSpec {
        BufferSpec::FixedWidth {
            byte_width: mem::size_of::<T>(),
            alignment: mem::align_of::<T>(),
        }
    }

    /// Layout with a single fixed-width buffer of `T` and an optional null mask.
    pub fn new_fixed_width<T>() -> Self {
        let values = Self::fixed_width_spec::<T>();
        Self {
            buffers: vec![values],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Layout with no buffers that can still carry a null mask.
    pub fn new_nullable_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Layout with no buffers and no null mask.
    pub fn new_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: false,
            variadic: false,
        }
    }

    /// Layout with a fixed-width buffer of `T` followed by a variable-width
    /// buffer, and an optional null mask.
    pub fn new_binary<T>() -> Self {
        let offsets = Self::fixed_width_spec::<T>();
        let values = BufferSpec::VariableWidth;
        Self {
            buffers: vec![offsets, values],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Layout with a single buffer of `u128`-sized views, an optional null
    /// mask, and `variadic` set.
    pub fn new_view() -> Self {
        let views = Self::fixed_width_spec::<u128>();
        Self {
            buffers: vec![views],
            can_contain_null_mask: true,
            variadic: true,
        }
    }

    /// Layout with two fixed-width buffers of `T` and an optional null mask.
    pub fn new_list_view<T>() -> Self {
        let first = Self::fixed_width_spec::<T>();
        let second = Self::fixed_width_spec::<T>();
        Self {
            buffers: vec![first, second],
            can_contain_null_mask: true,
            variadic: false,
        }
    }
}

/// Describes the contents of a single buffer within a [`DataTypeLayout`].
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// A buffer of equally sized values.
    FixedWidth {
        /// Size of one value in bytes.
        byte_width: usize,
        /// Alignment recorded for the values, in bytes.
        alignment: usize,
    },
    /// A buffer holding variable-width data.
    VariableWidth,
    /// A bit-packed buffer.
    BitMap,
    /// Not constructed anywhere in this part of the file.
    /// NOTE(review): presumably marks a buffer that is always null; confirm.
    #[allow(dead_code)]
    AlwaysNull,
}
1885
1886impl PartialEq for ArrayData {
1887 fn eq(&self, other: &Self) -> bool {
1888 equal::equal(self, other)
1889 }
1890}
1891
/// A boolean flag that starts out `false` and can only be changed through an
/// `unsafe` setter.
///
/// In this file it backs `ArrayDataBuilder::skip_validation`, where a `true`
/// value lets `build` skip `validate_data()`.
#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a flag set to `false`.
    #[inline]
    pub const fn new() -> Self {
        UnsafeFlag(false)
    }

    /// Sets the flag to `val`.
    ///
    /// # Safety
    ///
    /// The operation itself is memory-safe. It is marked `unsafe` so that
    /// changing the flag requires an `unsafe` block; the caller takes on
    /// whatever obligations the code consulting the flag attaches to it.
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        let UnsafeFlag(state) = self;
        *state = val;
    }

    /// Returns the current value of the flag.
    #[inline]
    pub fn get(&self) -> bool {
        let UnsafeFlag(state) = self;
        *state
    }
}

impl Default for UnsafeFlag {
    /// Same as [`UnsafeFlag::new`]: the flag is `false`.
    fn default() -> Self {
        UnsafeFlag::new()
    }
}
1950
/// Builder for `ArrayData`; the fields are assembled and validated by
/// `ArrayDataBuilder::build`.
#[derive(Debug)]
pub struct ArrayDataBuilder {
    /// Data type of the array being built.
    data_type: DataType,
    /// Number of elements.
    len: usize,
    /// Caller-supplied null count; only consulted when the null buffer is
    /// built from `null_bit_buffer`.
    null_count: Option<usize>,
    /// Raw validity bitmap; only used by `build` when `nulls` is `None`.
    null_bit_buffer: Option<Buffer>,
    /// Ready-made null buffer; takes precedence over `null_bit_buffer`.
    nulls: Option<NullBuffer>,
    /// Offset of the array.
    offset: usize,
    /// Data buffers of the array.
    buffers: Vec<Buffer>,
    /// Child arrays.
    child_data: Vec<ArrayData>,
    /// When `true`, `build` calls `ArrayData::align_buffers` before validating.
    align_buffers: bool,
    /// When set, `build` skips `validate_data()` unless the `force_validate`
    /// feature is enabled. Can only be set through an `unsafe` method.
    skip_validation: UnsafeFlag,
}
1977
1978impl ArrayDataBuilder {
1979 #[inline]
1980 pub const fn new(data_type: DataType) -> Self {
1982 Self {
1983 data_type,
1984 len: 0,
1985 null_count: None,
1986 null_bit_buffer: None,
1987 nulls: None,
1988 offset: 0,
1989 buffers: vec![],
1990 child_data: vec![],
1991 align_buffers: false,
1992 skip_validation: UnsafeFlag::new(),
1993 }
1994 }
1995
1996 pub fn data_type(self, data_type: DataType) -> Self {
1998 Self { data_type, ..self }
1999 }
2000
2001 #[inline]
2002 #[allow(clippy::len_without_is_empty)]
2003 pub const fn len(mut self, n: usize) -> Self {
2005 self.len = n;
2006 self
2007 }
2008
2009 pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
2011 self.nulls = nulls;
2012 self.null_count = None;
2013 self.null_bit_buffer = None;
2014 self
2015 }
2016
2017 pub fn null_count(mut self, null_count: usize) -> Self {
2019 self.null_count = Some(null_count);
2020 self
2021 }
2022
2023 pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
2025 self.nulls = None;
2026 self.null_bit_buffer = buf;
2027 self
2028 }
2029
2030 #[inline]
2032 pub const fn offset(mut self, n: usize) -> Self {
2033 self.offset = n;
2034 self
2035 }
2036
2037 pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
2039 self.buffers = v;
2040 self
2041 }
2042
2043 pub fn add_buffer(mut self, b: Buffer) -> Self {
2045 self.buffers.push(b);
2046 self
2047 }
2048
2049 pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
2051 self.buffers.extend(bs);
2052 self
2053 }
2054
2055 pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
2057 self.child_data = v;
2058 self
2059 }
2060
2061 pub fn add_child_data(mut self, r: ArrayData) -> Self {
2063 self.child_data.push(r);
2064 self
2065 }
2066
2067 pub unsafe fn build_unchecked(self) -> ArrayData {
2083 unsafe { self.skip_validation(true) }.build().unwrap()
2084 }
2085
2086 pub fn build(self) -> Result<ArrayData, ArrowError> {
2095 let Self {
2096 data_type,
2097 len,
2098 null_count,
2099 null_bit_buffer,
2100 nulls,
2101 offset,
2102 buffers,
2103 child_data,
2104 align_buffers,
2105 skip_validation,
2106 } = self;
2107
2108 let nulls = nulls
2109 .or_else(|| {
2110 let buffer = null_bit_buffer?;
2111 let buffer = BooleanBuffer::new(buffer, offset, len);
2112 Some(match null_count {
2113 Some(n) => {
2114 unsafe { NullBuffer::new_unchecked(buffer, n) }
2116 }
2117 None => NullBuffer::new(buffer),
2118 })
2119 })
2120 .filter(|b| b.null_count() != 0);
2121
2122 let mut data = ArrayData {
2123 data_type,
2124 len,
2125 offset,
2126 buffers,
2127 child_data,
2128 nulls,
2129 };
2130
2131 if align_buffers {
2132 data.align_buffers();
2133 }
2134
2135 if !skip_validation.get() || cfg!(feature = "force_validate") {
2137 data.validate_data()?;
2138 }
2139 Ok(data)
2140 }
2141
2142 #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2144 pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2145 self.align_buffers(true).build()
2146 }
2147
2148 pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2164 self.align_buffers = align_buffers;
2165 self
2166 }
2167
2168 pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2182 unsafe {
2183 self.skip_validation.set(skip_validation);
2184 }
2185 self
2186 }
2187}
2188
2189impl From<ArrayData> for ArrayDataBuilder {
2190 fn from(d: ArrayData) -> Self {
2191 Self {
2192 data_type: d.data_type,
2193 len: d.len,
2194 offset: d.offset,
2195 buffers: d.buffers,
2196 child_data: d.child_data,
2197 nulls: d.nulls,
2198 null_bit_buffer: None,
2199 null_count: None,
2200 align_buffers: false,
2201 skip_validation: UnsafeFlag::new(),
2202 }
2203 }
2204}
2205
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::{Field, Fields};

    /// Returns a buffer of `n` `i32` values, all set to 42.
    fn make_i32_buffer(n: usize) -> Buffer {
        Buffer::from_slice_ref(vec![42i32; n])
    }

    /// Returns a buffer of `n` `f32` values, all set to 42.
    fn make_f32_buffer(n: usize) -> Buffer {
        Buffer::from_slice_ref(vec![42f32; n])
    }

    /// The builder carries len, offset, buffers and the null bitmap through
    /// to the built `ArrayData`.
    #[test]
    fn test_builder() {
        // Buffer needs to be at least 25 long: offset 5 + len 20.
        let v = (0..25).collect::<Vec<i32>>();
        let b1 = Buffer::from_slice_ref(&v);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(20)
            .offset(5)
            .add_buffer(b1)
            .null_bit_buffer(Some(Buffer::from([
                0b01011111, 0b10110101, 0b01100011, 0b00011110,
            ])))
            .build()
            .unwrap();

        assert_eq!(20, arr_data.len());
        assert_eq!(10, arr_data.null_count());
        assert_eq!(5, arr_data.offset());
        assert_eq!(1, arr_data.buffers().len());
        assert_eq!(
            Buffer::from_slice_ref(&v).as_slice(),
            arr_data.buffers()[0].as_slice()
        );
    }

    /// A struct array built with one child exposes that child unchanged.
    #[test]
    fn test_builder_with_child_data() {
        let child_arr_data = ArrayData::try_new(
            DataType::Int32,
            5,
            None,
            0,
            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
            vec![],
        )
        .unwrap();

        let field = Arc::new(Field::new("x", DataType::Int32, true));
        let data_type = DataType::Struct(vec![field].into());

        let arr_data = ArrayData::builder(data_type)
            .len(5)
            .offset(0)
            .add_child_data(child_arr_data.clone())
            .build()
            .unwrap();

        assert_eq!(5, arr_data.len());
        assert_eq!(1, arr_data.child_data().len());
        assert_eq!(child_arr_data, arr_data.child_data()[0]);
    }

    /// The null count is derived from the validity bitmap, honoring the offset.
    #[test]
    fn test_null_count() {
        // Bits 0, 3 and 10 are valid: 3 of 16 slots, so 13 nulls.
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert_eq!(13, arr_data.null_count());

        // Same bitmap with offset 2 and len 12: the window covers bits 2..14,
        // where only bits 3 and 10 are valid, so 10 nulls.
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(12)
            .offset(2)
            // The data buffer must hold offset + len = 14 values.
            .add_buffer(make_i32_buffer(14))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert_eq!(10, arr_data.null_count());
    }

    /// The validity bytes of the built array equal the bitmap passed in.
    #[test]
    fn test_null_buffer_ref() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert!(arr_data.nulls().is_some());
        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
    }

    /// Slicing adjusts len, offset and the null count; slicing a slice
    /// accumulates the offsets.
    #[test]
    fn test_slice() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        // Dropping slot 0 (a valid slot) leaves the null count unchanged.
        let new_data = data.slice(1, 15);
        assert_eq!(data.len() - 1, new_data.len());
        assert_eq!(1, new_data.offset());
        assert_eq!(data.null_count(), new_data.null_count());

        // Slice of a slice: now slot 1 (a null slot) is dropped as well.
        let new_data = new_data.slice(1, 14);
        assert_eq!(data.len() - 2, new_data.len());
        assert_eq!(2, new_data.offset());
        assert_eq!(data.null_count() - 1, new_data.null_count());
    }

    /// `==` compares by value, while `ptr_eq` requires the same buffers and
    /// the same offset and length.
    #[test]
    fn test_equality() {
        let int_data = ArrayData::builder(DataType::Int32)
            .len(1)
            .add_buffer(make_i32_buffer(1))
            .build()
            .unwrap();

        let float_data = ArrayData::builder(DataType::Float32)
            .len(1)
            .add_buffer(make_f32_buffer(1))
            .build()
            .unwrap();
        assert_ne!(int_data, float_data);
        assert!(!int_data.ptr_eq(&float_data));
        assert!(int_data.ptr_eq(&int_data));

        // A clone shares its buffers with the original.
        #[allow(clippy::redundant_clone)]
        let int_data_clone = int_data.clone();
        assert_eq!(int_data, int_data_clone);
        assert!(int_data.ptr_eq(&int_data_clone));
        assert!(int_data_clone.ptr_eq(&int_data));

        // A slice differs in offset and length, so it is not `ptr_eq`.
        let int_data_slice = int_data_clone.slice(1, 0);
        assert!(int_data_slice.ptr_eq(&int_data_slice));
        assert!(!int_data.ptr_eq(&int_data_slice));
        assert!(!int_data_slice.ptr_eq(&int_data));

        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
        let string_data = ArrayData::try_new(
            DataType::Utf8,
            3,
            Some(Buffer::from_iter(vec![true, false, true])),
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();

        assert_ne!(float_data, string_data);
        assert!(!float_data.ptr_eq(&string_data));

        assert!(string_data.ptr_eq(&string_data));

        #[allow(clippy::redundant_clone)]
        let string_data_cloned = string_data.clone();
        assert!(string_data_cloned.ptr_eq(&string_data));
        assert!(string_data.ptr_eq(&string_data_cloned));

        let string_data_slice = string_data.slice(1, 2);
        assert!(string_data_slice.ptr_eq(&string_data_slice));
        assert!(!string_data_slice.ptr_eq(&string_data))
    }

    /// `get_slice_memory_size` shrinks when the array is sliced.
    #[test]
    fn test_slice_memory_size() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        // Two fewer i32 values: 8 bytes less.
        let new_data = data.slice(1, 14);
        assert_eq!(
            data.get_slice_memory_size().unwrap() - 8,
            new_data.get_slice_memory_size().unwrap()
        );
        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
        let string_data = ArrayData::try_new(
            DataType::Utf8,
            3,
            Some(Buffer::from_iter(vec![true, false, true])),
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
        let string_data_slice = string_data.slice(1, 2);
        // The sliced string array is expected to report 6 bytes less.
        assert_eq!(
            string_data.get_slice_memory_size().unwrap() - 6,
            string_data_slice.get_slice_memory_size().unwrap()
        );
    }

    /// `count_nulls` counts the unset bits in the requested window.
    #[test]
    fn test_count_nulls() {
        let buffer = Buffer::from([0b00010110, 0b10011111]);
        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
        let count = count_nulls(Some(&buffer), 0, 16);
        assert_eq!(count, 7);

        let count = count_nulls(Some(&buffer), 4, 8);
        assert_eq!(count, 3);
    }

    /// `contains_nulls` reports whether any slot in the window is null; an
    /// empty window contains none.
    #[test]
    fn test_contains_nulls() {
        let buffer: Buffer =
            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
        assert!(contains_nulls(Some(&buffer), 0, 6));
        assert!(contains_nulls(Some(&buffer), 0, 3));
        assert!(!contains_nulls(Some(&buffer), 3, 2));
        assert!(!contains_nulls(Some(&buffer), 0, 0));
    }

    /// A misaligned buffer fails validation until `align_buffers` is called.
    #[test]
    fn test_alignment() {
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        // Slicing by one byte produces a buffer that is not i32-aligned.
        let sliced = buffer.slice(1);

        let mut data = ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![buffer],
            child_data: vec![],
            nulls: None,
        };
        data.validate_full().unwrap();

        // Swap in the misaligned buffer.
        data.buffers[0] = sliced;
        let err = data.validate().unwrap_err();

        assert_eq!(
            err.to_string(),
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
        );

        data.align_buffers();
        data.validate_full().unwrap();
    }

    /// Same as `test_alignment`, but the misaligned buffer sits in a child of
    /// a struct array.
    #[test]
    fn test_alignment_struct() {
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        let sliced = buffer.slice(1);

        let child_data = ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![buffer],
            child_data: vec![],
            nulls: None,
        };

        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
        let mut data = ArrayData {
            data_type: schema,
            len: 0,
            offset: 0,
            buffers: vec![],
            child_data: vec![child_data],
            nulls: None,
        };
        data.validate_full().unwrap();

        // Swap the misaligned buffer into the child.
        data.child_data[0].buffers[0] = sliced;
        let err = data.validate().unwrap_err();

        assert_eq!(
            err.to_string(),
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
        );

        data.align_buffers();
        data.validate_full().unwrap();
    }

    /// `new_null` produces all-null arrays for the view types.
    #[test]
    fn test_null_view_types() {
        let array_len = 32;
        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }

        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }

        let array = ArrayData::new_null(
            &DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
            array_len,
        );
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }

        let array = ArrayData::new_null(
            &DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
            array_len,
        );
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }
    }
}