1use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24 ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35 null_bit_buffer: Option<&NullBuffer>,
36 offset: usize,
37 len: usize,
38) -> bool {
39 match null_bit_buffer {
40 Some(buffer) => {
41 match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42 Some((start, end)) => start != 0 || end != len,
43 None => len != 0, }
45 }
46 None => false, }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52 null_bit_buffer: Option<&NullBuffer>,
53 offset: usize,
54 len: usize,
55) -> usize {
56 if let Some(buf) = null_bit_buffer {
57 let buffer = buf.buffer();
58 len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59 } else {
60 0
61 }
62}
63
64#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67 let empty_buffer = MutableBuffer::new(0);
68 match data_type {
69 DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70 DataType::Boolean => {
71 let bytes = bit_util::ceil(capacity, 8);
72 let buffer = MutableBuffer::new(bytes);
73 [buffer, empty_buffer]
74 }
75 DataType::UInt8
76 | DataType::UInt16
77 | DataType::UInt32
78 | DataType::UInt64
79 | DataType::Int8
80 | DataType::Int16
81 | DataType::Int32
82 | DataType::Int64
83 | DataType::Float16
84 | DataType::Float32
85 | DataType::Float64
86 | DataType::Decimal32(_, _)
87 | DataType::Decimal64(_, _)
88 | DataType::Decimal128(_, _)
89 | DataType::Decimal256(_, _)
90 | DataType::Date32
91 | DataType::Time32(_)
92 | DataType::Date64
93 | DataType::Time64(_)
94 | DataType::Duration(_)
95 | DataType::Timestamp(_, _)
96 | DataType::Interval(_) => [
97 MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98 empty_buffer,
99 ],
100 DataType::Utf8 | DataType::Binary => {
101 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102 buffer.push(0i32);
104 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105 }
106 DataType::LargeUtf8 | DataType::LargeBinary => {
107 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108 buffer.push(0i64);
110 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111 }
112 DataType::BinaryView | DataType::Utf8View => [
113 MutableBuffer::new(capacity * mem::size_of::<u128>()),
114 empty_buffer,
115 ],
116 DataType::List(_) | DataType::Map(_, _) => {
117 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119 buffer.push(0i32);
120 [buffer, empty_buffer]
121 }
122 DataType::ListView(_) => [
123 MutableBuffer::new(capacity * mem::size_of::<i32>()),
124 MutableBuffer::new(capacity * mem::size_of::<i32>()),
125 ],
126 DataType::LargeList(_) => {
127 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129 buffer.push(0i64);
130 [buffer, empty_buffer]
131 }
132 DataType::LargeListView(_) => [
133 MutableBuffer::new(capacity * mem::size_of::<i64>()),
134 MutableBuffer::new(capacity * mem::size_of::<i64>()),
135 ],
136 DataType::FixedSizeBinary(size) => {
137 [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138 }
139 DataType::Dictionary(k, _) => [
140 MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141 empty_buffer,
142 ],
143 DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144 [empty_buffer, MutableBuffer::new(0)]
145 }
146 DataType::Union(_, mode) => {
147 let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148 match mode {
149 UnionMode::Sparse => [type_ids, empty_buffer],
150 UnionMode::Dense => {
151 let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152 [type_ids, offsets]
153 }
154 }
155 }
156 }
157}
158
/// Type-erased description of an array: its logical type, length, offset,
/// raw buffers, child arrays and optional null buffer.
#[derive(Debug, Clone)]
pub struct ArrayData {
    /// The logical type of this array.
    data_type: DataType,

    /// The number of elements in this array.
    len: usize,

    /// The element offset applied when reading from `buffers`.
    offset: usize,

    /// The buffers backing this array; their number and meaning depend on
    /// `data_type`.
    buffers: Vec<Buffer>,

    /// The child arrays, used by nested types such as lists, structs,
    /// unions, dictionaries and run-end encoded arrays.
    child_data: Vec<ArrayData>,

    /// The null buffer, if any. `validate` requires its length to equal
    /// `len`, and `slice` re-slices it for the new window.
    nulls: Option<NullBuffer>,
}
252
/// A reference-counted ([`Arc`]) handle to an [`ArrayData`].
pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257 pub unsafe fn new_unchecked(
274 data_type: DataType,
275 len: usize,
276 null_count: Option<usize>,
277 null_bit_buffer: Option<Buffer>,
278 offset: usize,
279 buffers: Vec<Buffer>,
280 child_data: Vec<ArrayData>,
281 ) -> Self {
282 let mut skip_validation = UnsafeFlag::new();
283 unsafe { skip_validation.set(true) };
285
286 ArrayDataBuilder {
287 data_type,
288 len,
289 null_count,
290 null_bit_buffer,
291 nulls: None,
292 offset,
293 buffers,
294 child_data,
295 align_buffers: false,
296 skip_validation,
297 }
298 .build()
299 .unwrap()
300 }
301
302 pub fn try_new(
316 data_type: DataType,
317 len: usize,
318 null_bit_buffer: Option<Buffer>,
319 offset: usize,
320 buffers: Vec<Buffer>,
321 child_data: Vec<ArrayData>,
322 ) -> Result<Self, ArrowError> {
323 if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
327 let needed_len = bit_util::ceil(len + offset, 8);
328 if null_bit_buffer.len() < needed_len {
329 return Err(ArrowError::InvalidArgumentError(format!(
330 "null_bit_buffer size too small. got {} needed {}",
331 null_bit_buffer.len(),
332 needed_len
333 )));
334 }
335 }
336 let new_self = unsafe {
338 Self::new_unchecked(
339 data_type,
340 len,
341 None,
342 null_bit_buffer,
343 offset,
344 buffers,
345 child_data,
346 )
347 };
348
349 new_self.validate_data()?;
354 Ok(new_self)
355 }
356
357 pub fn into_parts(
363 self,
364 ) -> (
365 DataType,
366 usize,
367 Option<NullBuffer>,
368 usize,
369 Vec<Buffer>,
370 Vec<ArrayData>,
371 ) {
372 let Self {
373 data_type,
374 len,
375 nulls,
376 offset,
377 buffers,
378 child_data,
379 } = self;
380
381 (data_type, len, nulls, offset, buffers, child_data)
382 }
383
/// Returns a new [`ArrayDataBuilder`] for an array of `data_type`.
#[inline]
pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
    ArrayDataBuilder::new(data_type)
}
389
/// Returns a reference to the [`DataType`] of this array.
#[inline]
pub const fn data_type(&self) -> &DataType {
    &self.data_type
}
395
396 pub fn buffers(&self) -> &[Buffer] {
398 &self.buffers
399 }
400
401 pub fn child_data(&self) -> &[ArrayData] {
404 &self.child_data[..]
405 }
406
407 #[inline]
409 pub fn is_null(&self, i: usize) -> bool {
410 match &self.nulls {
411 Some(v) => v.is_null(i),
412 None => false,
413 }
414 }
415
/// Returns a reference to the null buffer of this array, if it has one.
#[inline]
pub fn nulls(&self) -> Option<&NullBuffer> {
    self.nulls.as_ref()
}
423
/// Returns whether the element at index `i` is not null; the negation of
/// [`Self::is_null`].
#[inline]
pub fn is_valid(&self, i: usize) -> bool {
    !self.is_null(i)
}
429
/// Returns the number of elements in this array.
#[inline]
pub const fn len(&self) -> usize {
    self.len
}
435
/// Returns whether this array has zero elements.
#[inline]
pub const fn is_empty(&self) -> bool {
    self.len == 0
}
441
/// Returns the element offset of this array into its buffers.
#[inline]
pub const fn offset(&self) -> usize {
    self.offset
}
447
448 #[inline]
450 pub fn null_count(&self) -> usize {
451 self.nulls
452 .as_ref()
453 .map(|x| x.null_count())
454 .unwrap_or_default()
455 }
456
457 pub fn get_buffer_memory_size(&self) -> usize {
469 let mut size = 0;
470 for buffer in &self.buffers {
471 size += buffer.capacity();
472 }
473 if let Some(bitmap) = &self.nulls {
474 size += bitmap.buffer().capacity()
475 }
476 for child in &self.child_data {
477 size += child.get_buffer_memory_size();
478 }
479 size
480 }
481
482 pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
495 let mut result: usize = 0;
496 let layout = layout(&self.data_type);
497
498 for spec in layout.buffers.iter() {
499 match spec {
500 BufferSpec::FixedWidth { byte_width, .. } => {
501 let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
502 ArrowError::ComputeError(
503 "Integer overflow computing buffer size".to_string(),
504 )
505 })?;
506 result += buffer_size;
507 }
508 BufferSpec::VariableWidth => {
509 let buffer_len = match self.data_type {
510 DataType::Utf8 | DataType::Binary => {
511 let offsets = self.typed_offsets::<i32>()?;
512 (offsets[self.len] - offsets[0]) as usize
513 }
514 DataType::LargeUtf8 | DataType::LargeBinary => {
515 let offsets = self.typed_offsets::<i64>()?;
516 (offsets[self.len] - offsets[0]) as usize
517 }
518 _ => {
519 return Err(ArrowError::NotYetImplemented(format!(
520 "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
521 self.data_type
522 )));
523 }
524 };
525 result += buffer_len;
526 }
527 BufferSpec::BitMap => {
528 let buffer_size = bit_util::ceil(self.len, 8);
529 result += buffer_size;
530 }
531 BufferSpec::AlwaysNull => {
532 }
534 }
535 }
536
537 if self.nulls().is_some() {
538 result += bit_util::ceil(self.len, 8);
539 }
540
541 for child in &self.child_data {
542 result += child.get_slice_memory_size()?;
543 }
544 Ok(result)
545 }
546
547 pub fn get_array_memory_size(&self) -> usize {
556 let mut size = mem::size_of_val(self);
557
558 for buffer in &self.buffers {
560 size += mem::size_of::<Buffer>();
561 size += buffer.capacity();
562 }
563 if let Some(nulls) = &self.nulls {
564 size += nulls.buffer().capacity();
565 }
566 for child in &self.child_data {
567 size += child.get_array_memory_size();
568 }
569
570 size
571 }
572
573 pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
581 assert!((offset + length) <= self.len());
582
583 if let DataType::Struct(_) = self.data_type() {
584 let new_offset = self.offset + offset;
586 ArrayData {
587 data_type: self.data_type().clone(),
588 len: length,
589 offset: new_offset,
590 buffers: self.buffers.clone(),
591 child_data: self
593 .child_data()
594 .iter()
595 .map(|data| data.slice(offset, length))
596 .collect(),
597 nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
598 }
599 } else {
600 let mut new_data = self.clone();
601
602 new_data.len = length;
603 new_data.offset = offset + self.offset;
604 new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
605
606 new_data
607 }
608 }
609
610 pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
617 &self.buffers()[buffer].typed_data()[self.offset..]
618 }
619
/// Returns a new [`ArrayData`] of `data_type` with `len` elements, all of
/// which are logically null.
///
/// Most types get zero-filled buffers plus an all-null null buffer. `Null`,
/// `Union` and `RunEndEncoded` arrays get no null buffer of their own;
/// unions and run-end encoded arrays instead point at null children.
///
/// # Panics
///
/// Panics for a `Union` with no fields, for a dense union or run-end
/// encoded array whose `len` does not fit the offset / run-end type, for a
/// dictionary whose key type has no primitive width, and for run-end types
/// other than `Int16`, `Int32` or `Int64`.
pub fn new_null(data_type: &DataType, len: usize) -> Self {
    // Bytes needed for a bitmap with one bit per element.
    let bit_len = bit_util::ceil(len, 8);
    // Allocates a zero-filled immutable buffer of `len` bytes.
    let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));

    // `has_nulls` records whether a top-level null buffer should be attached.
    let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
        // Every type with a fixed primitive width: one zeroed values buffer.
        Some(width) => (vec![zeroed(width * len)], vec![], true),
        None => match data_type {
            DataType::Null => (vec![], vec![], false),
            DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
            // `len + 1` zero offsets plus an empty values buffer.
            DataType::Binary | DataType::Utf8 => {
                (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
            }
            // One zeroed 16-byte view per element.
            DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
            DataType::LargeBinary | DataType::LargeUtf8 => {
                (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
            }
            DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
            // Lists: all-zero offsets over an empty child.
            DataType::List(f) | DataType::Map(f, _) => (
                vec![zeroed((len + 1) * 4)],
                vec![ArrayData::new_empty(f.data_type())],
                true,
            ),
            DataType::LargeList(f) => (
                vec![zeroed((len + 1) * 8)],
                vec![ArrayData::new_empty(f.data_type())],
                true,
            ),
            // List views: zeroed offsets and zeroed sizes over an empty child.
            DataType::ListView(f) => (
                vec![zeroed(len * 4), zeroed(len * 4)],
                vec![ArrayData::new_empty(f.data_type())],
                true,
            ),
            DataType::LargeListView(f) => (
                vec![zeroed(len * 8), zeroed(len * 8)],
                vec![ArrayData::new_empty(f.data_type())],
                true,
            ),
            // The child must still hold `list_len * len` (null) values.
            DataType::FixedSizeList(f, list_len) => (
                vec![],
                vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
                true,
            ),
            // Every struct field becomes a null child of the same length.
            DataType::Struct(fields) => (
                vec![],
                fields
                    .iter()
                    .map(|f| Self::new_null(f.data_type(), len))
                    .collect(),
                true,
            ),
            // Zeroed keys over an empty dictionary; the null buffer masks them.
            DataType::Dictionary(k, v) => (
                vec![zeroed(k.primitive_width().unwrap() * len)],
                vec![ArrayData::new_empty(v.as_ref())],
                true,
            ),
            DataType::Union(f, mode) => {
                // Every slot uses the type id of the first declared field.
                let (id, _) = f.iter().next().unwrap();
                let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
                let buffers = match mode {
                    UnionMode::Sparse => vec![ids],
                    UnionMode::Dense => {
                        // Dense offsets are simply 0..len into the first child.
                        let end_offset = i32::from_usize(len).unwrap();
                        vec![ids, Buffer::from_iter(0_i32..end_offset)]
                    }
                };

                // The first child (every child, for sparse unions) is a null
                // array of `len`; remaining dense children are empty.
                let children = f
                    .iter()
                    .enumerate()
                    .map(|(idx, (_, f))| {
                        if idx == 0 || *mode == UnionMode::Sparse {
                            Self::new_null(f.data_type(), len)
                        } else {
                            Self::new_empty(f.data_type())
                        }
                    })
                    .collect();

                (buffers, children, false)
            }
            DataType::RunEndEncoded(r, v) => {
                if len == 0 {
                    // Empty array: empty run-ends and empty values.
                    let runs = ArrayData::new_empty(r.data_type());
                    let values = ArrayData::new_empty(v.data_type());
                    (vec![], vec![runs, values], false)
                } else {
                    // A single run ending at `len`, paired with one null value.
                    let runs = match r.data_type() {
                        DataType::Int16 => {
                            let i = i16::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        DataType::Int32 => {
                            let i = i32::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        DataType::Int64 => {
                            let i = i64::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        dt => unreachable!("Invalid run ends data type {dt}"),
                    };

                    let builder = ArrayData::builder(r.data_type().clone())
                        .len(1)
                        .buffers(vec![runs]);

                    // NOTE(review): relies on the single-element buffer built
                    // just above matching the run-ends type; confirm against
                    // the contract of `build_unchecked`.
                    let runs = unsafe { builder.build_unchecked() };
                    (
                        vec![],
                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
                        false,
                    )
                }
            }
            // These types are expected to report a primitive width and so to
            // have been handled by the `Some(width)` arm above.
            DataType::Int8
            | DataType::Int16
            | DataType::Int32
            | DataType::Int64
            | DataType::UInt8
            | DataType::UInt16
            | DataType::UInt32
            | DataType::UInt64
            | DataType::Float16
            | DataType::Float32
            | DataType::Float64
            | DataType::Timestamp(_, _)
            | DataType::Date32
            | DataType::Date64
            | DataType::Time32(_)
            | DataType::Time64(_)
            | DataType::Duration(_)
            | DataType::Interval(_)
            | DataType::Decimal32(_, _)
            | DataType::Decimal64(_, _)
            | DataType::Decimal128(_, _)
            | DataType::Decimal256(_, _) => unreachable!("{data_type}"),
        },
    };

    let mut builder = ArrayDataBuilder::new(data_type.clone())
        .len(len)
        .buffers(buffers)
        .child_data(child_data);

    if has_nulls {
        builder = builder.nulls(Some(NullBuffer::new_null(len)))
    }

    // NOTE(review): validation is skipped on the assumption that the buffers
    // and children assembled above match the layout of `data_type`.
    unsafe { builder.build_unchecked() }
}
777
/// Returns a new empty [`ArrayData`] of `data_type`; equivalent to
/// [`Self::new_null`] with a length of zero.
pub fn new_empty(data_type: &DataType) -> Self {
    Self::new_null(data_type, 0)
}
782
783 pub fn align_buffers(&mut self) {
792 let layout = layout(&self.data_type);
793 for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
794 if let BufferSpec::FixedWidth { alignment, .. } = spec {
795 if buffer.as_ptr().align_offset(*alignment) != 0 {
796 *buffer = Buffer::from_slice_ref(buffer.as_ref());
797 }
798 }
799 }
800 for data in self.child_data.iter_mut() {
802 data.align_buffers()
803 }
804 }
805
806 pub fn validate(&self) -> Result<(), ArrowError> {
817 let len_plus_offset = self.len + self.offset;
819
820 let layout = layout(&self.data_type);
822
823 if !layout.can_contain_null_mask && self.nulls.is_some() {
824 return Err(ArrowError::InvalidArgumentError(format!(
825 "Arrays of type {:?} cannot contain a null bitmask",
826 self.data_type,
827 )));
828 }
829
830 if self.buffers.len() < layout.buffers.len()
832 || (!layout.variadic && self.buffers.len() != layout.buffers.len())
833 {
834 return Err(ArrowError::InvalidArgumentError(format!(
835 "Expected {} buffers in array of type {:?}, got {}",
836 layout.buffers.len(),
837 self.data_type,
838 self.buffers.len(),
839 )));
840 }
841
842 for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
843 match spec {
844 BufferSpec::FixedWidth {
845 byte_width,
846 alignment,
847 } => {
848 let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
849
850 if buffer.len() < min_buffer_size {
851 return Err(ArrowError::InvalidArgumentError(format!(
852 "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
853 min_buffer_size,
854 i,
855 self.data_type,
856 buffer.len()
857 )));
858 }
859
860 let align_offset = buffer.as_ptr().align_offset(*alignment);
861 if align_offset != 0 {
862 return Err(ArrowError::InvalidArgumentError(format!(
863 "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
864 self.data_type,
865 align_offset.min(alignment - align_offset)
866 )));
867 }
868 }
869 BufferSpec::VariableWidth => {
870 }
874 BufferSpec::BitMap => {
875 let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
876 if buffer.len() < min_buffer_size {
877 return Err(ArrowError::InvalidArgumentError(format!(
878 "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
879 min_buffer_size,
880 i,
881 self.data_type,
882 buffer.len()
883 )));
884 }
885 }
886 BufferSpec::AlwaysNull => {
887 }
889 }
890 }
891
892 if let Some(nulls) = self.nulls() {
894 if nulls.null_count() > self.len {
895 return Err(ArrowError::InvalidArgumentError(format!(
896 "null_count {} for an array exceeds length of {} elements",
897 nulls.null_count(),
898 self.len
899 )));
900 }
901
902 let actual_len = nulls.validity().len();
903 let needed_len = bit_util::ceil(len_plus_offset, 8);
904 if actual_len < needed_len {
905 return Err(ArrowError::InvalidArgumentError(format!(
906 "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
907 )));
908 }
909
910 if nulls.len() != self.len {
911 return Err(ArrowError::InvalidArgumentError(format!(
912 "null buffer incorrect size. got {} expected {}",
913 nulls.len(),
914 self.len
915 )));
916 }
917 }
918
919 self.validate_child_data()?;
920
921 match &self.data_type {
923 DataType::Utf8 | DataType::Binary => {
924 self.validate_offsets::<i32>(self.buffers[1].len())?;
925 }
926 DataType::LargeUtf8 | DataType::LargeBinary => {
927 self.validate_offsets::<i64>(self.buffers[1].len())?;
928 }
929 DataType::Dictionary(key_type, _value_type) => {
930 if !DataType::is_dictionary_key_type(key_type) {
932 return Err(ArrowError::InvalidArgumentError(format!(
933 "Dictionary key type must be integer, but was {key_type}"
934 )));
935 }
936 }
937 DataType::RunEndEncoded(run_ends_type, _) => {
938 if run_ends_type.is_nullable() {
939 return Err(ArrowError::InvalidArgumentError(
940 "The nullable should be set to false for the field defining run_ends array.".to_string()
941 ));
942 }
943 if !DataType::is_run_ends_type(run_ends_type.data_type()) {
944 return Err(ArrowError::InvalidArgumentError(format!(
945 "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
946 run_ends_type.data_type()
947 )));
948 }
949 }
950 _ => {}
951 };
952
953 Ok(())
954 }
955
956 fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
963 if self.len == 0 && self.buffers[0].is_empty() {
965 return Ok(&[]);
966 }
967
968 self.typed_buffer(0, self.len + 1)
969 }
970
971 fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
973 &self,
974 idx: usize,
975 len: usize,
976 ) -> Result<&[T], ArrowError> {
977 let buffer = &self.buffers[idx];
978
979 let required_len = (len + self.offset) * mem::size_of::<T>();
980
981 if buffer.len() < required_len {
982 return Err(ArrowError::InvalidArgumentError(format!(
983 "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
984 idx,
985 self.data_type,
986 required_len,
987 buffer.len()
988 )));
989 }
990
991 Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
992 }
993
994 fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
997 &self,
998 values_length: usize,
999 ) -> Result<(), ArrowError> {
1000 let offsets = self.typed_offsets::<T>()?;
1002 if offsets.is_empty() {
1003 return Ok(());
1004 }
1005
1006 let first_offset = offsets[0].to_usize().ok_or_else(|| {
1007 ArrowError::InvalidArgumentError(format!(
1008 "Error converting offset[0] ({}) to usize for {}",
1009 offsets[0], self.data_type
1010 ))
1011 })?;
1012
1013 let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
1014 ArrowError::InvalidArgumentError(format!(
1015 "Error converting offset[{}] ({}) to usize for {}",
1016 self.len, offsets[self.len], self.data_type
1017 ))
1018 })?;
1019
1020 if first_offset > values_length {
1021 return Err(ArrowError::InvalidArgumentError(format!(
1022 "First offset {} of {} is larger than values length {}",
1023 first_offset, self.data_type, values_length,
1024 )));
1025 }
1026
1027 if last_offset > values_length {
1028 return Err(ArrowError::InvalidArgumentError(format!(
1029 "Last offset {} of {} is larger than values length {}",
1030 last_offset, self.data_type, values_length,
1031 )));
1032 }
1033
1034 if first_offset > last_offset {
1035 return Err(ArrowError::InvalidArgumentError(format!(
1036 "First offset {} in {} is smaller than last offset {}",
1037 first_offset, self.data_type, last_offset,
1038 )));
1039 }
1040
1041 Ok(())
1042 }
1043
1044 fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
1047 &self,
1048 values_length: usize,
1049 ) -> Result<(), ArrowError> {
1050 let offsets: &[T] = self.typed_buffer(0, self.len)?;
1051 let sizes: &[T] = self.typed_buffer(1, self.len)?;
1052 if offsets.len() != sizes.len() {
1053 return Err(ArrowError::ComputeError(format!(
1054 "ListView offsets len {} does not match sizes len {}",
1055 offsets.len(),
1056 sizes.len()
1057 )));
1058 }
1059
1060 for i in 0..sizes.len() {
1061 let size = sizes[i].to_usize().ok_or_else(|| {
1062 ArrowError::InvalidArgumentError(format!(
1063 "Error converting size[{}] ({}) to usize for {}",
1064 i, sizes[i], self.data_type
1065 ))
1066 })?;
1067 let offset = offsets[i].to_usize().ok_or_else(|| {
1068 ArrowError::InvalidArgumentError(format!(
1069 "Error converting offset[{}] ({}) to usize for {}",
1070 i, offsets[i], self.data_type
1071 ))
1072 })?;
1073 if size
1074 .checked_add(offset)
1075 .expect("Offset and size have exceeded the usize boundary")
1076 > values_length
1077 {
1078 return Err(ArrowError::InvalidArgumentError(format!(
1079 "Size {} at index {} is larger than the remaining values for {}",
1080 size, i, self.data_type
1081 )));
1082 }
1083 }
1084 Ok(())
1085 }
1086
1087 fn validate_child_data(&self) -> Result<(), ArrowError> {
1089 match &self.data_type {
1090 DataType::List(field) | DataType::Map(field, _) => {
1091 let values_data = self.get_single_valid_child_data(field.data_type())?;
1092 self.validate_offsets::<i32>(values_data.len)?;
1093 Ok(())
1094 }
1095 DataType::LargeList(field) => {
1096 let values_data = self.get_single_valid_child_data(field.data_type())?;
1097 self.validate_offsets::<i64>(values_data.len)?;
1098 Ok(())
1099 }
1100 DataType::ListView(field) => {
1101 let values_data = self.get_single_valid_child_data(field.data_type())?;
1102 self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1103 Ok(())
1104 }
1105 DataType::LargeListView(field) => {
1106 let values_data = self.get_single_valid_child_data(field.data_type())?;
1107 self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1108 Ok(())
1109 }
1110 DataType::FixedSizeList(field, list_size) => {
1111 let values_data = self.get_single_valid_child_data(field.data_type())?;
1112
1113 let list_size: usize = (*list_size).try_into().map_err(|_| {
1114 ArrowError::InvalidArgumentError(format!(
1115 "{} has a negative list_size {}",
1116 self.data_type, list_size
1117 ))
1118 })?;
1119
1120 let expected_values_len = self.len
1121 .checked_mul(list_size)
1122 .expect("integer overflow computing expected number of expected values in FixedListSize");
1123
1124 if values_data.len < expected_values_len {
1125 return Err(ArrowError::InvalidArgumentError(format!(
1126 "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1127 values_data.len, self.len, list_size, self.data_type
1128 )));
1129 }
1130
1131 Ok(())
1132 }
1133 DataType::Struct(fields) => {
1134 self.validate_num_child_data(fields.len())?;
1135 for (i, field) in fields.iter().enumerate() {
1136 let field_data = self.get_valid_child_data(i, field.data_type())?;
1137
1138 if field_data.len < self.len {
1140 return Err(ArrowError::InvalidArgumentError(format!(
1141 "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1142 self.data_type,
1143 i,
1144 field.name(),
1145 field_data.len,
1146 self.len
1147 )));
1148 }
1149 }
1150 Ok(())
1151 }
1152 DataType::RunEndEncoded(run_ends_field, values_field) => {
1153 self.validate_num_child_data(2)?;
1154 let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1155 let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1156 if run_ends_data.len != values_data.len {
1157 return Err(ArrowError::InvalidArgumentError(format!(
1158 "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1159 run_ends_data.len, values_data.len
1160 )));
1161 }
1162 if run_ends_data.nulls.is_some() {
1163 return Err(ArrowError::InvalidArgumentError(
1164 "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1165 ));
1166 }
1167 Ok(())
1168 }
1169 DataType::Union(fields, mode) => {
1170 self.validate_num_child_data(fields.len())?;
1171
1172 for (i, (_, field)) in fields.iter().enumerate() {
1173 let field_data = self.get_valid_child_data(i, field.data_type())?;
1174
1175 if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1176 return Err(ArrowError::InvalidArgumentError(format!(
1177 "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1178 i,
1179 field_data.len,
1180 self.len + self.offset
1181 )));
1182 }
1183 }
1184 Ok(())
1185 }
1186 DataType::Dictionary(_key_type, value_type) => {
1187 self.get_single_valid_child_data(value_type)?;
1188 Ok(())
1189 }
1190 _ => {
1191 if !self.child_data.is_empty() {
1193 return Err(ArrowError::InvalidArgumentError(format!(
1194 "Expected no child arrays for type {} but got {}",
1195 self.data_type,
1196 self.child_data.len()
1197 )));
1198 }
1199 Ok(())
1200 }
1201 }
1202 }
1203
1204 fn get_single_valid_child_data(
1208 &self,
1209 expected_type: &DataType,
1210 ) -> Result<&ArrayData, ArrowError> {
1211 self.validate_num_child_data(1)?;
1212 self.get_valid_child_data(0, expected_type)
1213 }
1214
1215 fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1217 if self.child_data.len() != expected_len {
1218 Err(ArrowError::InvalidArgumentError(format!(
1219 "Value data for {} should contain {} child data array(s), had {}",
1220 self.data_type,
1221 expected_len,
1222 self.child_data.len()
1223 )))
1224 } else {
1225 Ok(())
1226 }
1227 }
1228
1229 fn get_valid_child_data(
1232 &self,
1233 i: usize,
1234 expected_type: &DataType,
1235 ) -> Result<&ArrayData, ArrowError> {
1236 let values_data = self.child_data.get(i).ok_or_else(|| {
1237 ArrowError::InvalidArgumentError(format!(
1238 "{} did not have enough child arrays. Expected at least {} but had only {}",
1239 self.data_type,
1240 i + 1,
1241 self.child_data.len()
1242 ))
1243 })?;
1244
1245 if expected_type != &values_data.data_type {
1246 return Err(ArrowError::InvalidArgumentError(format!(
1247 "Child type mismatch for {}. Expected {} but child data had {}",
1248 self.data_type, expected_type, values_data.data_type
1249 )));
1250 }
1251
1252 values_data.validate()?;
1253 Ok(values_data)
1254 }
1255
1256 pub fn validate_data(&self) -> Result<(), ArrowError> {
1272 self.validate()?;
1273
1274 self.validate_nulls()?;
1275 self.validate_values()?;
1276 Ok(())
1277 }
1278
1279 pub fn validate_full(&self) -> Result<(), ArrowError> {
1284 self.validate_data()?;
1285 self.child_data
1287 .iter()
1288 .enumerate()
1289 .try_for_each(|(i, child_data)| {
1290 child_data.validate_full().map_err(|e| {
1291 ArrowError::InvalidArgumentError(format!(
1292 "{} child #{} invalid: {}",
1293 self.data_type, i, e
1294 ))
1295 })
1296 })?;
1297 Ok(())
1298 }
1299
1300 pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1310 if let Some(nulls) = &self.nulls {
1311 let actual = nulls.len() - nulls.inner().count_set_bits();
1312 if actual != nulls.null_count() {
1313 return Err(ArrowError::InvalidArgumentError(format!(
1314 "null_count value ({}) doesn't match actual number of nulls in array ({})",
1315 nulls.null_count(),
1316 actual
1317 )));
1318 }
1319 }
1320
1321 match &self.data_type {
1326 DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1327 if !f.is_nullable() {
1328 self.validate_non_nullable(None, &self.child_data[0])?
1329 }
1330 }
1331 DataType::FixedSizeList(field, len) => {
1332 let child = &self.child_data[0];
1333 if !field.is_nullable() {
1334 match &self.nulls {
1335 Some(nulls) => {
1336 let element_len = *len as usize;
1337 let expanded = nulls.expand(element_len);
1338 self.validate_non_nullable(Some(&expanded), child)?;
1339 }
1340 None => self.validate_non_nullable(None, child)?,
1341 }
1342 }
1343 }
1344 DataType::Struct(fields) => {
1345 for (field, child) in fields.iter().zip(&self.child_data) {
1346 if !field.is_nullable() {
1347 self.validate_non_nullable(self.nulls(), child)?
1348 }
1349 }
1350 }
1351 _ => {}
1352 }
1353
1354 Ok(())
1355 }
1356
1357 fn validate_non_nullable(
1359 &self,
1360 mask: Option<&NullBuffer>,
1361 child: &ArrayData,
1362 ) -> Result<(), ArrowError> {
1363 let mask = match mask {
1364 Some(mask) => mask,
1365 None => {
1366 return match child.null_count() {
1367 0 => Ok(()),
1368 _ => Err(ArrowError::InvalidArgumentError(format!(
1369 "non-nullable child of type {} contains nulls not present in parent {}",
1370 child.data_type, self.data_type
1371 ))),
1372 };
1373 }
1374 };
1375
1376 match child.nulls() {
1377 Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1378 "non-nullable child of type {} contains nulls not present in parent",
1379 child.data_type
1380 ))),
1381 _ => Ok(()),
1382 }
1383 }
1384
1385 pub fn validate_values(&self) -> Result<(), ArrowError> {
1391 match &self.data_type {
1392 DataType::Utf8 => self.validate_utf8::<i32>(),
1393 DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1394 DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1395 DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1396 DataType::BinaryView => {
1397 let views = self.typed_buffer::<u128>(0, self.len)?;
1398 validate_binary_view(views, &self.buffers[1..])
1399 }
1400 DataType::Utf8View => {
1401 let views = self.typed_buffer::<u128>(0, self.len)?;
1402 validate_string_view(views, &self.buffers[1..])
1403 }
1404 DataType::List(_) | DataType::Map(_, _) => {
1405 let child = &self.child_data[0];
1406 self.validate_offsets_full::<i32>(child.len)
1407 }
1408 DataType::LargeList(_) => {
1409 let child = &self.child_data[0];
1410 self.validate_offsets_full::<i64>(child.len)
1411 }
1412 DataType::Union(_, _) => {
1413 Ok(())
1419 }
1420 DataType::Dictionary(key_type, _value_type) => {
1421 let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1422 let max_value = dictionary_length - 1;
1423 match key_type.as_ref() {
1424 DataType::UInt8 => self.check_bounds::<u8>(max_value),
1425 DataType::UInt16 => self.check_bounds::<u16>(max_value),
1426 DataType::UInt32 => self.check_bounds::<u32>(max_value),
1427 DataType::UInt64 => self.check_bounds::<u64>(max_value),
1428 DataType::Int8 => self.check_bounds::<i8>(max_value),
1429 DataType::Int16 => self.check_bounds::<i16>(max_value),
1430 DataType::Int32 => self.check_bounds::<i32>(max_value),
1431 DataType::Int64 => self.check_bounds::<i64>(max_value),
1432 _ => unreachable!(),
1433 }
1434 }
1435 DataType::RunEndEncoded(run_ends, _values) => {
1436 let run_ends_data = self.child_data()[0].clone();
1437 match run_ends.data_type() {
1438 DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1439 DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1440 DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1441 _ => unreachable!(),
1442 }
1443 }
1444 _ => {
1445 Ok(())
1447 }
1448 }
1449 }
1450
1451 fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1462 where
1463 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1464 V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1465 {
1466 self.typed_offsets::<T>()?
1467 .iter()
1468 .enumerate()
1469 .map(|(i, x)| {
1470 let r = x.to_usize().ok_or_else(|| {
1472 ArrowError::InvalidArgumentError(format!(
1473 "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1474 );
1475 match r {
1477 Ok(n) if n <= offset_limit => Ok((i, n)),
1478 Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1479 "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1480 ),
1481 Err(e) => Err(e),
1482 }
1483 })
1484 .scan(0_usize, |start, end| {
1485 match end {
1487 Ok((i, end)) if *start <= end => {
1488 let range = Some(Ok((i, *start..end)));
1489 *start = end;
1490 range
1491 }
1492 Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1493 "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1494 i - 1, start, end))
1495 )),
1496 Err(err) => Some(Err(err)),
1497 }
1498 })
1499 .skip(1) .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1501 let (item_index, range) = res?;
1502 validate(item_index-1, range)
1503 })
1504 }
1505
1506 fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1509 where
1510 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1511 {
1512 let values_buffer = &self.buffers[1].as_slice();
1513 if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1514 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1516 if !values_str.is_char_boundary(range.start)
1517 || !values_str.is_char_boundary(range.end)
1518 {
1519 return Err(ArrowError::InvalidArgumentError(format!(
1520 "incomplete utf-8 byte sequence from index {string_index}"
1521 )));
1522 }
1523 Ok(())
1524 })
1525 } else {
1526 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1528 std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1529 ArrowError::InvalidArgumentError(format!(
1530 "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1531 ))
1532 })?;
1533 Ok(())
1534 })
1535 }
1536 }
1537
1538 fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1541 where
1542 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1543 {
1544 self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1545 Ok(())
1548 })
1549 }
1550
1551 fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1554 where
1555 T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1556 {
1557 let required_len = self.len + self.offset;
1558 let buffer = &self.buffers[0];
1559
1560 assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1563
1564 let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1566
1567 indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1568 if self.is_null(i) {
1570 return Ok(());
1571 }
1572 let dict_index: i64 = dict_index.try_into().map_err(|_| {
1573 ArrowError::InvalidArgumentError(format!(
1574 "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1575 ))
1576 })?;
1577
1578 if dict_index < 0 || dict_index > max_value {
1579 return Err(ArrowError::InvalidArgumentError(format!(
1580 "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1581 )));
1582 }
1583 Ok(())
1584 })
1585 }
1586
1587 fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1589 where
1590 T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1591 {
1592 let values = self.typed_buffer::<T>(0, self.len)?;
1593 let mut prev_value: i64 = 0_i64;
1594 values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1595 let value: i64 = inp_value.try_into().map_err(|_| {
1596 ArrowError::InvalidArgumentError(format!(
1597 "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1598 ))
1599 })?;
1600 if value <= 0_i64 {
1601 return Err(ArrowError::InvalidArgumentError(format!(
1602 "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1603 )));
1604 }
1605 if ix > 0 && value <= prev_value {
1606 return Err(ArrowError::InvalidArgumentError(format!(
1607 "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1608 )));
1609 }
1610
1611 prev_value = value;
1612 Ok(())
1613 })?;
1614
1615 if prev_value.as_usize() < (self.offset + self.len) {
1616 return Err(ArrowError::InvalidArgumentError(format!(
1617 "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1618 self.offset + self.len
1619 )));
1620 }
1621 Ok(())
1622 }
1623
1624 pub fn ptr_eq(&self, other: &Self) -> bool {
1628 if self.offset != other.offset
1629 || self.len != other.len
1630 || self.data_type != other.data_type
1631 || self.buffers.len() != other.buffers.len()
1632 || self.child_data.len() != other.child_data.len()
1633 {
1634 return false;
1635 }
1636
1637 match (&self.nulls, &other.nulls) {
1638 (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1639 (Some(_), None) | (None, Some(_)) => return false,
1640 _ => {}
1641 };
1642
1643 if !self
1644 .buffers
1645 .iter()
1646 .zip(other.buffers.iter())
1647 .all(|(a, b)| a.as_ptr() == b.as_ptr())
1648 {
1649 return false;
1650 }
1651
1652 self.child_data
1653 .iter()
1654 .zip(other.child_data.iter())
1655 .all(|(a, b)| a.ptr_eq(b))
1656 }
1657
1658 pub fn into_builder(self) -> ArrayDataBuilder {
1660 self.into()
1661 }
1662}
1663
1664pub fn layout(data_type: &DataType) -> DataTypeLayout {
1667 use arrow_schema::IntervalUnit::*;
1670
1671 match data_type {
1672 DataType::Null => DataTypeLayout {
1673 buffers: vec![],
1674 can_contain_null_mask: false,
1675 variadic: false,
1676 },
1677 DataType::Boolean => DataTypeLayout {
1678 buffers: vec![BufferSpec::BitMap],
1679 can_contain_null_mask: true,
1680 variadic: false,
1681 },
1682 DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1683 DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1684 DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1685 DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1686 DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1687 DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1688 DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1689 DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1690 DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1691 DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1692 DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1693 DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1694 DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1695 DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1696 DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1697 DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1698 DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1699 DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1700 DataType::Interval(MonthDayNano) => {
1701 DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1702 }
1703 DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1704 DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1705 DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1706 DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1707 DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1708 DataType::FixedSizeBinary(size) => {
1709 let spec = BufferSpec::FixedWidth {
1710 byte_width: (*size).try_into().unwrap(),
1711 alignment: mem::align_of::<u8>(),
1712 };
1713 DataTypeLayout {
1714 buffers: vec![spec],
1715 can_contain_null_mask: true,
1716 variadic: false,
1717 }
1718 }
1719 DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1720 DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1721 DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1722 DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1723 DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1724 DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1726 DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1727 DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1728 DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1729 DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1730 DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), DataType::Union(_, mode) => {
1733 let type_ids = BufferSpec::FixedWidth {
1734 byte_width: mem::size_of::<i8>(),
1735 alignment: mem::align_of::<i8>(),
1736 };
1737
1738 DataTypeLayout {
1739 buffers: match mode {
1740 UnionMode::Sparse => {
1741 vec![type_ids]
1742 }
1743 UnionMode::Dense => {
1744 vec![
1745 type_ids,
1746 BufferSpec::FixedWidth {
1747 byte_width: mem::size_of::<i32>(),
1748 alignment: mem::align_of::<i32>(),
1749 },
1750 ]
1751 }
1752 },
1753 can_contain_null_mask: false,
1754 variadic: false,
1755 }
1756 }
1757 DataType::Dictionary(key_type, _value_type) => layout(key_type),
1758 }
1759}
1760
/// Describes the buffers an array of a given data type is expected to have,
/// as produced by [`layout`].
#[derive(Debug, PartialEq, Eq)]
pub struct DataTypeLayout {
    /// One [`BufferSpec`] per expected buffer, in buffer order.
    pub buffers: Vec<BufferSpec>,

    /// Whether arrays of this type may carry a null (validity) mask.
    pub can_contain_null_mask: bool,

    /// Set to `true` only by [`DataTypeLayout::new_view`] in this file.
    /// NOTE(review): presumably signals that additional buffers beyond those
    /// listed in `buffers` may follow — confirm against the validation code.
    pub variadic: bool,
}
1776
1777impl DataTypeLayout {
1778 pub fn new_fixed_width<T>() -> Self {
1780 Self {
1781 buffers: vec![BufferSpec::FixedWidth {
1782 byte_width: mem::size_of::<T>(),
1783 alignment: mem::align_of::<T>(),
1784 }],
1785 can_contain_null_mask: true,
1786 variadic: false,
1787 }
1788 }
1789
1790 pub fn new_nullable_empty() -> Self {
1793 Self {
1794 buffers: vec![],
1795 can_contain_null_mask: true,
1796 variadic: false,
1797 }
1798 }
1799
1800 pub fn new_empty() -> Self {
1803 Self {
1804 buffers: vec![],
1805 can_contain_null_mask: false,
1806 variadic: false,
1807 }
1808 }
1809
1810 pub fn new_binary<T>() -> Self {
1814 Self {
1815 buffers: vec![
1816 BufferSpec::FixedWidth {
1818 byte_width: mem::size_of::<T>(),
1819 alignment: mem::align_of::<T>(),
1820 },
1821 BufferSpec::VariableWidth,
1823 ],
1824 can_contain_null_mask: true,
1825 variadic: false,
1826 }
1827 }
1828
1829 pub fn new_view() -> Self {
1831 Self {
1832 buffers: vec![BufferSpec::FixedWidth {
1833 byte_width: mem::size_of::<u128>(),
1834 alignment: mem::align_of::<u128>(),
1835 }],
1836 can_contain_null_mask: true,
1837 variadic: true,
1838 }
1839 }
1840
1841 pub fn new_list_view<T>() -> Self {
1843 Self {
1844 buffers: vec![
1845 BufferSpec::FixedWidth {
1846 byte_width: mem::size_of::<T>(),
1847 alignment: mem::align_of::<T>(),
1848 },
1849 BufferSpec::FixedWidth {
1850 byte_width: mem::size_of::<T>(),
1851 alignment: mem::align_of::<T>(),
1852 },
1853 ],
1854 can_contain_null_mask: true,
1855 variadic: false,
1856 }
1857 }
1858}
1859
/// Describes the expected shape of a single buffer within a
/// [`DataTypeLayout`].
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// A buffer of fixed-width values.
    FixedWidth {
        /// Width of each value in bytes.
        byte_width: usize,
        /// Required alignment of the buffer in bytes.
        alignment: usize,
    },
    /// A buffer of variable-width data (the second buffer of the
    /// binary/string layouts built by `DataTypeLayout::new_binary`).
    VariableWidth,
    /// A bitmap buffer; within this file only the `Boolean` layout uses it.
    BitMap,
    /// Not constructed anywhere in the visible code, hence the `dead_code` allowance.
    /// NOTE(review): presumably kept for parity with other Arrow implementations — confirm.
    #[allow(dead_code)]
    AlwaysNull,
}
1892
/// Equality for [`ArrayData`] is delegated to `equal::equal` rather than
/// derived field by field.
impl PartialEq for ArrayData {
    fn eq(&self, other: &Self) -> bool {
        equal::equal(self, other)
    }
}
1898
/// A boolean flag that can only be changed through an `unsafe` call.
///
/// The wrapped `bool` is private and [`UnsafeFlag::set`] is an `unsafe fn`,
/// so flipping the flag always requires an `unsafe` block at the call site.
/// The flag starts out as `false`, both via [`UnsafeFlag::new`] and via
/// `Default`.
#[derive(Debug, Clone, Default)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a flag whose value is `false`.
    #[inline]
    pub const fn new() -> Self {
        UnsafeFlag(false)
    }

    /// Overwrites the flag with `val`.
    ///
    /// # Safety
    ///
    /// The function body performs no unsafe operation itself; the caller
    /// takes responsibility for whatever the flag guards (in this file, it is
    /// used to skip validation in `ArrayDataBuilder`).
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        self.0 = val;
    }

    /// Returns the current value of the flag.
    #[inline]
    pub fn get(&self) -> bool {
        let UnsafeFlag(value) = *self;
        value
    }
}
1957
/// Builder for [`ArrayData`]; see [`ArrayDataBuilder::build`] for how the
/// fields are combined.
#[derive(Debug)]
pub struct ArrayDataBuilder {
    /// Data type of the array being built.
    data_type: DataType,
    /// Number of elements.
    len: usize,
    /// Optional caller-supplied null count; only consulted by `build` when the
    /// null buffer is constructed from `null_bit_buffer`.
    null_count: Option<usize>,
    /// Raw validity bitmap; ignored by `build` when `nulls` is set.
    null_bit_buffer: Option<Buffer>,
    /// Null buffer; takes precedence over `null_bit_buffer` in `build`.
    nulls: Option<NullBuffer>,
    /// Offset of the array.
    offset: usize,
    /// Data buffers.
    buffers: Vec<Buffer>,
    /// Child arrays.
    child_data: Vec<ArrayData>,
    /// When `true`, `build` calls `ArrayData::align_buffers` before validating.
    align_buffers: bool,
    /// When set, `build` skips `validate_data` unless the `force_validate`
    /// feature is enabled. Can only be set through `unsafe` code.
    skip_validation: UnsafeFlag,
}
1984
1985impl ArrayDataBuilder {
1986 #[inline]
1987 pub const fn new(data_type: DataType) -> Self {
1989 Self {
1990 data_type,
1991 len: 0,
1992 null_count: None,
1993 null_bit_buffer: None,
1994 nulls: None,
1995 offset: 0,
1996 buffers: vec![],
1997 child_data: vec![],
1998 align_buffers: false,
1999 skip_validation: UnsafeFlag::new(),
2000 }
2001 }
2002
2003 pub fn data_type(self, data_type: DataType) -> Self {
2005 Self { data_type, ..self }
2006 }
2007
2008 #[inline]
2009 #[allow(clippy::len_without_is_empty)]
2010 pub const fn len(mut self, n: usize) -> Self {
2012 self.len = n;
2013 self
2014 }
2015
2016 pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
2018 self.nulls = nulls;
2019 self.null_count = None;
2020 self.null_bit_buffer = None;
2021 self
2022 }
2023
2024 pub fn null_count(mut self, null_count: usize) -> Self {
2026 self.null_count = Some(null_count);
2027 self
2028 }
2029
2030 pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
2032 self.nulls = None;
2033 self.null_bit_buffer = buf;
2034 self
2035 }
2036
2037 #[inline]
2039 pub const fn offset(mut self, n: usize) -> Self {
2040 self.offset = n;
2041 self
2042 }
2043
2044 pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
2046 self.buffers = v;
2047 self
2048 }
2049
2050 pub fn add_buffer(mut self, b: Buffer) -> Self {
2052 self.buffers.push(b);
2053 self
2054 }
2055
2056 pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
2058 self.buffers.extend(bs);
2059 self
2060 }
2061
2062 pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
2064 self.child_data = v;
2065 self
2066 }
2067
2068 pub fn add_child_data(mut self, r: ArrayData) -> Self {
2070 self.child_data.push(r);
2071 self
2072 }
2073
2074 pub unsafe fn build_unchecked(self) -> ArrayData {
2090 unsafe { self.skip_validation(true) }.build().unwrap()
2091 }
2092
2093 pub fn build(self) -> Result<ArrayData, ArrowError> {
2102 let Self {
2103 data_type,
2104 len,
2105 null_count,
2106 null_bit_buffer,
2107 nulls,
2108 offset,
2109 buffers,
2110 child_data,
2111 align_buffers,
2112 skip_validation,
2113 } = self;
2114
2115 let nulls = nulls
2116 .or_else(|| {
2117 let buffer = null_bit_buffer?;
2118 let buffer = BooleanBuffer::new(buffer, offset, len);
2119 Some(match null_count {
2120 Some(n) => {
2121 unsafe { NullBuffer::new_unchecked(buffer, n) }
2123 }
2124 None => NullBuffer::new(buffer),
2125 })
2126 })
2127 .filter(|b| b.null_count() != 0);
2128
2129 let mut data = ArrayData {
2130 data_type,
2131 len,
2132 offset,
2133 buffers,
2134 child_data,
2135 nulls,
2136 };
2137
2138 if align_buffers {
2139 data.align_buffers();
2140 }
2141
2142 if !skip_validation.get() || cfg!(feature = "force_validate") {
2144 data.validate_data()?;
2145 }
2146 Ok(data)
2147 }
2148
2149 #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2151 pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2152 self.align_buffers(true).build()
2153 }
2154
2155 pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2171 self.align_buffers = align_buffers;
2172 self
2173 }
2174
2175 pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2189 unsafe {
2190 self.skip_validation.set(skip_validation);
2191 }
2192 self
2193 }
2194}
2195
2196impl From<ArrayData> for ArrayDataBuilder {
2197 fn from(d: ArrayData) -> Self {
2198 Self {
2199 data_type: d.data_type,
2200 len: d.len,
2201 offset: d.offset,
2202 buffers: d.buffers,
2203 child_data: d.child_data,
2204 nulls: d.nulls,
2205 null_bit_buffer: None,
2206 null_count: None,
2207 align_buffers: false,
2208 skip_validation: UnsafeFlag::new(),
2209 }
2210 }
2211}
2212
/// Unit tests for `ArrayDataBuilder`, slicing, pointer equality, null
/// counting helpers, buffer alignment and all-null construction.
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::{Field, Fields};

    /// Returns a buffer holding `n` copies of `42i32`.
    fn make_i32_buffer(n: usize) -> Buffer {
        Buffer::from_slice_ref(vec![42i32; n])
    }

    /// Returns a buffer holding `n` copies of `42f32`.
    fn make_f32_buffer(n: usize) -> Buffer {
        Buffer::from_slice_ref(vec![42f32; n])
    }

    /// Builds an `Int32` array with an offset and a raw null bitmap and checks
    /// the resulting length, null count, offset and buffer contents.
    #[test]
    fn test_builder() {
        let v = (0..25).collect::<Vec<i32>>();
        let b1 = Buffer::from_slice_ref(&v);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(20)
            .offset(5)
            .add_buffer(b1)
            .null_bit_buffer(Some(Buffer::from([
                0b01011111, 0b10110101, 0b01100011, 0b00011110,
            ])))
            .build()
            .unwrap();

        assert_eq!(20, arr_data.len());
        // 10 of the 20 validity bits in the window [5, 25) are set, leaving 10 nulls.
        assert_eq!(10, arr_data.null_count());
        assert_eq!(5, arr_data.offset());
        assert_eq!(1, arr_data.buffers().len());
        assert_eq!(
            Buffer::from_slice_ref(&v).as_slice(),
            arr_data.buffers()[0].as_slice()
        );
    }

    /// Builds a struct array with a single `Int32` child and checks that the
    /// child is stored unchanged.
    #[test]
    fn test_builder_with_child_data() {
        let child_arr_data = ArrayData::try_new(
            DataType::Int32,
            5,
            None,
            0,
            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
            vec![],
        )
        .unwrap();

        let field = Arc::new(Field::new("x", DataType::Int32, true));
        let data_type = DataType::Struct(vec![field].into());

        let arr_data = ArrayData::builder(data_type)
            .len(5)
            .offset(0)
            .add_child_data(child_arr_data.clone())
            .build()
            .unwrap();

        assert_eq!(5, arr_data.len());
        assert_eq!(1, arr_data.child_data().len());
        assert_eq!(child_arr_data, arr_data.child_data()[0]);
    }

    /// Checks the null count derived from a raw bitmap, with and without an offset.
    #[test]
    fn test_null_count() {
        // Validity bits 0, 3 and 10 are set: 3 valid values out of 16.
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert_eq!(13, arr_data.null_count());

        // Same bitmap, but the window [2, 14) only sees bits 3 and 10: 2 valid out of 12.
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(12)
            .offset(2)
            .add_buffer(make_i32_buffer(14))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert_eq!(10, arr_data.null_count());
    }

    /// Checks that the null buffer exposes the validity bytes it was built from.
    #[test]
    fn test_null_buffer_ref() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert!(arr_data.nulls().is_some());
        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
    }

    /// Checks length, offset and null count after slicing once and twice.
    #[test]
    fn test_slice() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        // Dropping slot 0 removes a valid value, so the null count is unchanged.
        let new_data = data.slice(1, 15);
        assert_eq!(data.len() - 1, new_data.len());
        assert_eq!(1, new_data.offset());
        assert_eq!(data.null_count(), new_data.null_count());

        // Slicing a slice: offsets accumulate, and the dropped slot 1 was null.
        let new_data = new_data.slice(1, 14);
        assert_eq!(data.len() - 2, new_data.len());
        assert_eq!(2, new_data.offset());
        assert_eq!(data.null_count() - 1, new_data.null_count());
    }

    /// Exercises `PartialEq` and `ptr_eq` across different types, clones and slices.
    #[test]
    fn test_equality() {
        let int_data = ArrayData::builder(DataType::Int32)
            .len(1)
            .add_buffer(make_i32_buffer(1))
            .build()
            .unwrap();

        let float_data = ArrayData::builder(DataType::Float32)
            .len(1)
            .add_buffer(make_f32_buffer(1))
            .build()
            .unwrap();
        assert_ne!(int_data, float_data);
        assert!(!int_data.ptr_eq(&float_data));
        assert!(int_data.ptr_eq(&int_data));

        // A clone shares the same buffers, so it is pointer-equal in both directions.
        #[allow(clippy::redundant_clone)]
        let int_data_clone = int_data.clone();
        assert_eq!(int_data, int_data_clone);
        assert!(int_data.ptr_eq(&int_data_clone));
        assert!(int_data_clone.ptr_eq(&int_data));

        // A slice has a different offset/length and is therefore not pointer-equal.
        let int_data_slice = int_data_clone.slice(1, 0);
        assert!(int_data_slice.ptr_eq(&int_data_slice));
        assert!(!int_data.ptr_eq(&int_data_slice));
        assert!(!int_data_slice.ptr_eq(&int_data));

        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
        let string_data = ArrayData::try_new(
            DataType::Utf8,
            3,
            Some(Buffer::from_iter(vec![true, false, true])),
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();

        assert_ne!(float_data, string_data);
        assert!(!float_data.ptr_eq(&string_data));

        assert!(string_data.ptr_eq(&string_data));

        #[allow(clippy::redundant_clone)]
        let string_data_cloned = string_data.clone();
        assert!(string_data_cloned.ptr_eq(&string_data));
        assert!(string_data.ptr_eq(&string_data_cloned));

        let string_data_slice = string_data.slice(1, 2);
        assert!(string_data_slice.ptr_eq(&string_data_slice));
        assert!(!string_data_slice.ptr_eq(&string_data))
    }

    /// Checks that `get_slice_memory_size` shrinks by the expected number of
    /// bytes after slicing.
    #[test]
    fn test_slice_memory_size() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        // The slice drops 2 of the 16 `i32` values, i.e. 8 bytes.
        let new_data = data.slice(1, 14);
        assert_eq!(
            data.get_slice_memory_size().unwrap() - 8,
            new_data.get_slice_memory_size().unwrap()
        );
        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
        let string_data = ArrayData::try_new(
            DataType::Utf8,
            3,
            Some(Buffer::from_iter(vec![true, false, true])),
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
        let string_data_slice = string_data.slice(1, 2);
        // NOTE(review): the 6-byte difference presumably is one 4-byte offset
        // plus the 2 value bytes of the dropped first string — confirm.
        assert_eq!(
            string_data.get_slice_memory_size().unwrap() - 6,
            string_data_slice.get_slice_memory_size().unwrap()
        );
    }

    /// Checks `count_nulls` over the whole bitmap and over a sub-window.
    #[test]
    fn test_count_nulls() {
        // 9 of the 16 bits are set, leaving 7 nulls.
        let buffer = Buffer::from([0b00010110, 0b10011111]);
        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
        let count = count_nulls(Some(&buffer), 0, 16);
        assert_eq!(count, 7);

        // In the window [4, 12), 5 bits are set, leaving 3 nulls.
        let count = count_nulls(Some(&buffer), 4, 8);
        assert_eq!(count, 3);
    }

    /// Checks `contains_nulls` on windows with nulls, without nulls and of length zero.
    #[test]
    fn test_contains_nulls() {
        let buffer: Buffer =
            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
        assert!(contains_nulls(Some(&buffer), 0, 6));
        assert!(contains_nulls(Some(&buffer), 0, 3));
        assert!(!contains_nulls(Some(&buffer), 3, 2));
        assert!(!contains_nulls(Some(&buffer), 0, 0));
    }

    /// Checks that a misaligned buffer fails validation and that
    /// `align_buffers` repairs it.
    #[test]
    fn test_alignment() {
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        // Slicing by a single byte misaligns the buffer for `i32`.
        let sliced = buffer.slice(1);

        let mut data = ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![buffer],
            child_data: vec![],
            nulls: None,
        };
        data.validate_full().unwrap();

        // Swap in the misaligned buffer; validation must now report it.
        data.buffers[0] = sliced;
        let err = data.validate().unwrap_err();

        assert_eq!(
            err.to_string(),
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
        );

        data.align_buffers();
        data.validate_full().unwrap();
    }

    /// Same as `test_alignment`, but the misaligned buffer lives in a struct's child.
    #[test]
    fn test_alignment_struct() {
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        // Slicing by a single byte misaligns the buffer for `i32`.
        let sliced = buffer.slice(1);

        let child_data = ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![buffer],
            child_data: vec![],
            nulls: None,
        };

        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
        let mut data = ArrayData {
            data_type: schema,
            len: 0,
            offset: 0,
            buffers: vec![],
            child_data: vec![child_data],
            nulls: None,
        };
        data.validate_full().unwrap();

        // Misalign the child's buffer; validating the parent must report it.
        data.child_data[0].buffers[0] = sliced;
        let err = data.validate().unwrap_err();

        assert_eq!(
            err.to_string(),
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
        );

        // Aligning the parent is expected to fix the child as well.
        data.align_buffers();
        data.validate_full().unwrap();
    }

    /// Checks that `ArrayData::new_null` yields all-null arrays of the
    /// requested length for the view and list-view types.
    #[test]
    fn test_null_view_types() {
        let array_len = 32;
        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }

        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }

        let array = ArrayData::new_null(
            &DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
            array_len,
        );
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }

        let array = ArrayData::new_null(
            &DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
            array_len,
        );
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }
    }
}