1use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24 ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35 null_bit_buffer: Option<&NullBuffer>,
36 offset: usize,
37 len: usize,
38) -> bool {
39 match null_bit_buffer {
40 Some(buffer) => {
41 match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42 Some((start, end)) => start != 0 || end != len,
43 None => len != 0, }
45 }
46 None => false, }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52 null_bit_buffer: Option<&NullBuffer>,
53 offset: usize,
54 len: usize,
55) -> usize {
56 if let Some(buf) = null_bit_buffer {
57 let buffer = buf.buffer();
58 len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59 } else {
60 0
61 }
62}
63
64#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67 let empty_buffer = MutableBuffer::new(0);
68 match data_type {
69 DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70 DataType::Boolean => {
71 let bytes = bit_util::ceil(capacity, 8);
72 let buffer = MutableBuffer::new(bytes);
73 [buffer, empty_buffer]
74 }
75 DataType::UInt8
76 | DataType::UInt16
77 | DataType::UInt32
78 | DataType::UInt64
79 | DataType::Int8
80 | DataType::Int16
81 | DataType::Int32
82 | DataType::Int64
83 | DataType::Float16
84 | DataType::Float32
85 | DataType::Float64
86 | DataType::Decimal32(_, _)
87 | DataType::Decimal64(_, _)
88 | DataType::Decimal128(_, _)
89 | DataType::Decimal256(_, _)
90 | DataType::Date32
91 | DataType::Time32(_)
92 | DataType::Date64
93 | DataType::Time64(_)
94 | DataType::Duration(_)
95 | DataType::Timestamp(_, _)
96 | DataType::Interval(_) => [
97 MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98 empty_buffer,
99 ],
100 DataType::Utf8 | DataType::Binary => {
101 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102 buffer.push(0i32);
104 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105 }
106 DataType::LargeUtf8 | DataType::LargeBinary => {
107 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108 buffer.push(0i64);
110 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111 }
112 DataType::BinaryView | DataType::Utf8View => [
113 MutableBuffer::new(capacity * mem::size_of::<u128>()),
114 empty_buffer,
115 ],
116 DataType::List(_) | DataType::Map(_, _) => {
117 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119 buffer.push(0i32);
120 [buffer, empty_buffer]
121 }
122 DataType::ListView(_) => [
123 MutableBuffer::new(capacity * mem::size_of::<i32>()),
124 MutableBuffer::new(capacity * mem::size_of::<i32>()),
125 ],
126 DataType::LargeList(_) => {
127 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129 buffer.push(0i64);
130 [buffer, empty_buffer]
131 }
132 DataType::LargeListView(_) => [
133 MutableBuffer::new(capacity * mem::size_of::<i64>()),
134 MutableBuffer::new(capacity * mem::size_of::<i64>()),
135 ],
136 DataType::FixedSizeBinary(size) => {
137 [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138 }
139 DataType::Dictionary(k, _) => [
140 MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141 empty_buffer,
142 ],
143 DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144 [empty_buffer, MutableBuffer::new(0)]
145 }
146 DataType::Union(_, mode) => {
147 let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148 match mode {
149 UnionMode::Sparse => [type_ids, empty_buffer],
150 UnionMode::Dense => {
151 let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152 [type_ids, offsets]
153 }
154 }
155 }
156 }
157}
158
159#[derive(Debug, Clone)]
205pub struct ArrayData {
206 data_type: DataType,
208
209 len: usize,
211
212 offset: usize,
217
218 buffers: Vec<Buffer>,
231
232 child_data: Vec<ArrayData>,
242
243 nulls: Option<NullBuffer>,
251}
252
253pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257 pub unsafe fn new_unchecked(
274 data_type: DataType,
275 len: usize,
276 null_count: Option<usize>,
277 null_bit_buffer: Option<Buffer>,
278 offset: usize,
279 buffers: Vec<Buffer>,
280 child_data: Vec<ArrayData>,
281 ) -> Self {
282 let mut skip_validation = UnsafeFlag::new();
283 unsafe { skip_validation.set(true) };
285
286 ArrayDataBuilder {
287 data_type,
288 len,
289 null_count,
290 null_bit_buffer,
291 nulls: None,
292 offset,
293 buffers,
294 child_data,
295 align_buffers: false,
296 skip_validation,
297 }
298 .build()
299 .unwrap()
300 }
301
302 pub fn try_new(
316 data_type: DataType,
317 len: usize,
318 null_bit_buffer: Option<Buffer>,
319 offset: usize,
320 buffers: Vec<Buffer>,
321 child_data: Vec<ArrayData>,
322 ) -> Result<Self, ArrowError> {
323 if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
327 let needed_len = bit_util::ceil(len + offset, 8);
328 if null_bit_buffer.len() < needed_len {
329 return Err(ArrowError::InvalidArgumentError(format!(
330 "null_bit_buffer size too small. got {} needed {}",
331 null_bit_buffer.len(),
332 needed_len
333 )));
334 }
335 }
336 let new_self = unsafe {
338 Self::new_unchecked(
339 data_type,
340 len,
341 None,
342 null_bit_buffer,
343 offset,
344 buffers,
345 child_data,
346 )
347 };
348
349 new_self.validate_data()?;
354 Ok(new_self)
355 }
356
357 pub fn into_parts(
363 self,
364 ) -> (
365 DataType,
366 usize,
367 Option<NullBuffer>,
368 usize,
369 Vec<Buffer>,
370 Vec<ArrayData>,
371 ) {
372 let Self {
373 data_type,
374 len,
375 nulls,
376 offset,
377 buffers,
378 child_data,
379 } = self;
380
381 (data_type, len, nulls, offset, buffers, child_data)
382 }
383
384 #[inline]
386 pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
387 ArrayDataBuilder::new(data_type)
388 }
389
390 #[inline]
392 pub const fn data_type(&self) -> &DataType {
393 &self.data_type
394 }
395
396 pub fn buffers(&self) -> &[Buffer] {
398 &self.buffers
399 }
400
401 pub fn child_data(&self) -> &[ArrayData] {
404 &self.child_data[..]
405 }
406
407 #[inline]
409 pub fn is_null(&self, i: usize) -> bool {
410 match &self.nulls {
411 Some(v) => v.is_null(i),
412 None => false,
413 }
414 }
415
416 #[inline]
420 pub fn nulls(&self) -> Option<&NullBuffer> {
421 self.nulls.as_ref()
422 }
423
424 #[inline]
426 pub fn is_valid(&self, i: usize) -> bool {
427 !self.is_null(i)
428 }
429
430 #[inline]
432 pub const fn len(&self) -> usize {
433 self.len
434 }
435
436 #[inline]
438 pub const fn is_empty(&self) -> bool {
439 self.len == 0
440 }
441
442 #[inline]
444 pub const fn offset(&self) -> usize {
445 self.offset
446 }
447
448 #[inline]
450 pub fn null_count(&self) -> usize {
451 self.nulls
452 .as_ref()
453 .map(|x| x.null_count())
454 .unwrap_or_default()
455 }
456
457 pub fn get_buffer_memory_size(&self) -> usize {
469 let mut size = 0;
470 for buffer in &self.buffers {
471 size += buffer.capacity();
472 }
473 if let Some(bitmap) = &self.nulls {
474 size += bitmap.buffer().capacity()
475 }
476 for child in &self.child_data {
477 size += child.get_buffer_memory_size();
478 }
479 size
480 }
481
482 pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
495 let mut result: usize = 0;
496 let layout = layout(&self.data_type);
497
498 for spec in layout.buffers.iter() {
499 match spec {
500 BufferSpec::FixedWidth { byte_width, .. } => {
501 let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
502 ArrowError::ComputeError(
503 "Integer overflow computing buffer size".to_string(),
504 )
505 })?;
506 result += buffer_size;
507 }
508 BufferSpec::VariableWidth => {
509 let buffer_len = match self.data_type {
510 DataType::Utf8 | DataType::Binary => {
511 let offsets = self.typed_offsets::<i32>()?;
512 (offsets[self.len] - offsets[0]) as usize
513 }
514 DataType::LargeUtf8 | DataType::LargeBinary => {
515 let offsets = self.typed_offsets::<i64>()?;
516 (offsets[self.len] - offsets[0]) as usize
517 }
518 _ => {
519 return Err(ArrowError::NotYetImplemented(format!(
520 "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
521 self.data_type
522 )));
523 }
524 };
525 result += buffer_len;
526 }
527 BufferSpec::BitMap => {
528 let buffer_size = bit_util::ceil(self.len, 8);
529 result += buffer_size;
530 }
531 BufferSpec::AlwaysNull => {
532 }
534 }
535 }
536
537 if self.nulls().is_some() {
538 result += bit_util::ceil(self.len, 8);
539 }
540
541 for child in &self.child_data {
542 result += child.get_slice_memory_size()?;
543 }
544 Ok(result)
545 }
546
547 pub fn get_array_memory_size(&self) -> usize {
556 let mut size = mem::size_of_val(self);
557
558 for buffer in &self.buffers {
560 size += mem::size_of::<Buffer>();
561 size += buffer.capacity();
562 }
563 if let Some(nulls) = &self.nulls {
564 size += nulls.buffer().capacity();
565 }
566 for child in &self.child_data {
567 size += child.get_array_memory_size();
568 }
569
570 size
571 }
572
573 pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
581 assert!((offset + length) <= self.len());
582
583 if let DataType::Struct(_) = self.data_type() {
584 let new_offset = self.offset + offset;
586 ArrayData {
587 data_type: self.data_type().clone(),
588 len: length,
589 offset: new_offset,
590 buffers: self.buffers.clone(),
591 child_data: self
593 .child_data()
594 .iter()
595 .map(|data| data.slice(offset, length))
596 .collect(),
597 nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
598 }
599 } else {
600 let mut new_data = self.clone();
601
602 new_data.len = length;
603 new_data.offset = offset + self.offset;
604 new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
605
606 new_data
607 }
608 }
609
610 pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
617 &self.buffers()[buffer].typed_data()[self.offset..]
618 }
619
620 pub fn new_null(data_type: &DataType, len: usize) -> Self {
622 let bit_len = bit_util::ceil(len, 8);
623 let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
624
625 let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
626 Some(width) => (vec![zeroed(width * len)], vec![], true),
627 None => match data_type {
628 DataType::Null => (vec![], vec![], false),
629 DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
630 DataType::Binary | DataType::Utf8 => {
631 (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
632 }
633 DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
634 DataType::LargeBinary | DataType::LargeUtf8 => {
635 (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
636 }
637 DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
638 DataType::List(f) | DataType::Map(f, _) => (
639 vec![zeroed((len + 1) * 4)],
640 vec![ArrayData::new_empty(f.data_type())],
641 true,
642 ),
643 DataType::LargeList(f) => (
644 vec![zeroed((len + 1) * 8)],
645 vec![ArrayData::new_empty(f.data_type())],
646 true,
647 ),
648 DataType::ListView(f) => (
649 vec![zeroed(len * 4), zeroed(len * 4)],
650 vec![ArrayData::new_empty(f.data_type())],
651 true,
652 ),
653 DataType::LargeListView(f) => (
654 vec![zeroed(len * 8), zeroed(len * 8)],
655 vec![ArrayData::new_empty(f.data_type())],
656 true,
657 ),
658 DataType::FixedSizeList(f, list_len) => (
659 vec![],
660 vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
661 true,
662 ),
663 DataType::Struct(fields) => (
664 vec![],
665 fields
666 .iter()
667 .map(|f| Self::new_null(f.data_type(), len))
668 .collect(),
669 true,
670 ),
671 DataType::Dictionary(k, v) => (
672 vec![zeroed(k.primitive_width().unwrap() * len)],
673 vec![ArrayData::new_empty(v.as_ref())],
674 true,
675 ),
676 DataType::Union(f, mode) => {
677 let (id, _) = f.iter().next().unwrap();
678 let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
679 let buffers = match mode {
680 UnionMode::Sparse => vec![ids],
681 UnionMode::Dense => {
682 let end_offset = i32::from_usize(len).unwrap();
683 vec![ids, Buffer::from_iter(0_i32..end_offset)]
684 }
685 };
686
687 let children = f
688 .iter()
689 .enumerate()
690 .map(|(idx, (_, f))| {
691 if idx == 0 || *mode == UnionMode::Sparse {
692 Self::new_null(f.data_type(), len)
693 } else {
694 Self::new_empty(f.data_type())
695 }
696 })
697 .collect();
698
699 (buffers, children, false)
700 }
701 DataType::RunEndEncoded(r, v) => {
702 if len == 0 {
703 let runs = ArrayData::new_empty(r.data_type());
705 let values = ArrayData::new_empty(v.data_type());
706 (vec![], vec![runs, values], false)
707 } else {
708 let runs = match r.data_type() {
709 DataType::Int16 => {
710 let i = i16::from_usize(len).expect("run overflow");
711 Buffer::from_slice_ref([i])
712 }
713 DataType::Int32 => {
714 let i = i32::from_usize(len).expect("run overflow");
715 Buffer::from_slice_ref([i])
716 }
717 DataType::Int64 => {
718 let i = i64::from_usize(len).expect("run overflow");
719 Buffer::from_slice_ref([i])
720 }
721 dt => unreachable!("Invalid run ends data type {dt}"),
722 };
723
724 let builder = ArrayData::builder(r.data_type().clone())
725 .len(1)
726 .buffers(vec![runs]);
727
728 let runs = unsafe { builder.build_unchecked() };
731 (
732 vec![],
733 vec![runs, ArrayData::new_null(v.data_type(), 1)],
734 false,
735 )
736 }
737 }
738 DataType::Int8
740 | DataType::Int16
741 | DataType::Int32
742 | DataType::Int64
743 | DataType::UInt8
744 | DataType::UInt16
745 | DataType::UInt32
746 | DataType::UInt64
747 | DataType::Float16
748 | DataType::Float32
749 | DataType::Float64
750 | DataType::Timestamp(_, _)
751 | DataType::Date32
752 | DataType::Date64
753 | DataType::Time32(_)
754 | DataType::Time64(_)
755 | DataType::Duration(_)
756 | DataType::Interval(_)
757 | DataType::Decimal32(_, _)
758 | DataType::Decimal64(_, _)
759 | DataType::Decimal128(_, _)
760 | DataType::Decimal256(_, _) => unreachable!("{data_type}"),
761 },
762 };
763
764 let mut builder = ArrayDataBuilder::new(data_type.clone())
765 .len(len)
766 .buffers(buffers)
767 .child_data(child_data);
768
769 if has_nulls {
770 builder = builder.nulls(Some(NullBuffer::new_null(len)))
771 }
772
773 unsafe { builder.build_unchecked() }
776 }
777
778 pub fn new_empty(data_type: &DataType) -> Self {
780 Self::new_null(data_type, 0)
781 }
782
783 pub fn align_buffers(&mut self) {
792 let layout = layout(&self.data_type);
793 for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
794 if let BufferSpec::FixedWidth { alignment, .. } = spec {
795 if buffer.as_ptr().align_offset(*alignment) != 0 {
796 *buffer = Buffer::from_slice_ref(buffer.as_ref());
797 }
798 }
799 }
800 for data in self.child_data.iter_mut() {
802 data.align_buffers()
803 }
804 }
805
806 pub fn validate(&self) -> Result<(), ArrowError> {
817 let len_plus_offset = self.len + self.offset;
819
820 let layout = layout(&self.data_type);
822
823 if !layout.can_contain_null_mask && self.nulls.is_some() {
824 return Err(ArrowError::InvalidArgumentError(format!(
825 "Arrays of type {:?} cannot contain a null bitmask",
826 self.data_type,
827 )));
828 }
829
830 if self.buffers.len() < layout.buffers.len()
832 || (!layout.variadic && self.buffers.len() != layout.buffers.len())
833 {
834 return Err(ArrowError::InvalidArgumentError(format!(
835 "Expected {} buffers in array of type {:?}, got {}",
836 layout.buffers.len(),
837 self.data_type,
838 self.buffers.len(),
839 )));
840 }
841
842 for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
843 match spec {
844 BufferSpec::FixedWidth {
845 byte_width,
846 alignment,
847 } => {
848 let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
849
850 if buffer.len() < min_buffer_size {
851 return Err(ArrowError::InvalidArgumentError(format!(
852 "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
853 min_buffer_size,
854 i,
855 self.data_type,
856 buffer.len()
857 )));
858 }
859
860 let align_offset = buffer.as_ptr().align_offset(*alignment);
861 if align_offset != 0 {
862 return Err(ArrowError::InvalidArgumentError(format!(
863 "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
864 self.data_type,
865 align_offset.min(alignment - align_offset)
866 )));
867 }
868 }
869 BufferSpec::VariableWidth => {
870 }
874 BufferSpec::BitMap => {
875 let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
876 if buffer.len() < min_buffer_size {
877 return Err(ArrowError::InvalidArgumentError(format!(
878 "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
879 min_buffer_size,
880 i,
881 self.data_type,
882 buffer.len()
883 )));
884 }
885 }
886 BufferSpec::AlwaysNull => {
887 }
889 }
890 }
891
892 if let Some(nulls) = self.nulls() {
894 if nulls.null_count() > self.len {
895 return Err(ArrowError::InvalidArgumentError(format!(
896 "null_count {} for an array exceeds length of {} elements",
897 nulls.null_count(),
898 self.len
899 )));
900 }
901
902 let actual_len = nulls.validity().len();
903 let needed_len = bit_util::ceil(len_plus_offset, 8);
904 if actual_len < needed_len {
905 return Err(ArrowError::InvalidArgumentError(format!(
906 "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
907 )));
908 }
909
910 if nulls.len() != self.len {
911 return Err(ArrowError::InvalidArgumentError(format!(
912 "null buffer incorrect size. got {} expected {}",
913 nulls.len(),
914 self.len
915 )));
916 }
917 }
918
919 self.validate_child_data()?;
920
921 match &self.data_type {
923 DataType::Utf8 | DataType::Binary => {
924 self.validate_offsets::<i32>(self.buffers[1].len())?;
925 }
926 DataType::LargeUtf8 | DataType::LargeBinary => {
927 self.validate_offsets::<i64>(self.buffers[1].len())?;
928 }
929 DataType::Dictionary(key_type, _value_type) => {
930 if !DataType::is_dictionary_key_type(key_type) {
932 return Err(ArrowError::InvalidArgumentError(format!(
933 "Dictionary key type must be integer, but was {key_type}"
934 )));
935 }
936 }
937 DataType::RunEndEncoded(run_ends_type, _) => {
938 if run_ends_type.is_nullable() {
939 return Err(ArrowError::InvalidArgumentError(
940 "The nullable should be set to false for the field defining run_ends array.".to_string()
941 ));
942 }
943 if !DataType::is_run_ends_type(run_ends_type.data_type()) {
944 return Err(ArrowError::InvalidArgumentError(format!(
945 "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
946 run_ends_type.data_type()
947 )));
948 }
949 }
950 _ => {}
951 };
952
953 Ok(())
954 }
955
956 fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
963 if self.len == 0 && self.buffers[0].is_empty() {
965 return Ok(&[]);
966 }
967
968 self.typed_buffer(0, self.len + 1)
969 }
970
971 fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
973 &self,
974 idx: usize,
975 len: usize,
976 ) -> Result<&[T], ArrowError> {
977 let buffer = &self.buffers[idx];
978
979 let required_len = (len + self.offset) * mem::size_of::<T>();
980
981 if buffer.len() < required_len {
982 return Err(ArrowError::InvalidArgumentError(format!(
983 "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
984 idx,
985 self.data_type,
986 required_len,
987 buffer.len()
988 )));
989 }
990
991 Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
992 }
993
994 fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
997 &self,
998 values_length: usize,
999 ) -> Result<(), ArrowError> {
1000 let offsets = self.typed_offsets::<T>()?;
1002 if offsets.is_empty() {
1003 return Ok(());
1004 }
1005
1006 let first_offset = offsets[0].to_usize().ok_or_else(|| {
1007 ArrowError::InvalidArgumentError(format!(
1008 "Error converting offset[0] ({}) to usize for {}",
1009 offsets[0], self.data_type
1010 ))
1011 })?;
1012
1013 let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
1014 ArrowError::InvalidArgumentError(format!(
1015 "Error converting offset[{}] ({}) to usize for {}",
1016 self.len, offsets[self.len], self.data_type
1017 ))
1018 })?;
1019
1020 if first_offset > values_length {
1021 return Err(ArrowError::InvalidArgumentError(format!(
1022 "First offset {} of {} is larger than values length {}",
1023 first_offset, self.data_type, values_length,
1024 )));
1025 }
1026
1027 if last_offset > values_length {
1028 return Err(ArrowError::InvalidArgumentError(format!(
1029 "Last offset {} of {} is larger than values length {}",
1030 last_offset, self.data_type, values_length,
1031 )));
1032 }
1033
1034 if first_offset > last_offset {
1035 return Err(ArrowError::InvalidArgumentError(format!(
1036 "First offset {} in {} is smaller than last offset {}",
1037 first_offset, self.data_type, last_offset,
1038 )));
1039 }
1040
1041 Ok(())
1042 }
1043
1044 fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
1047 &self,
1048 values_length: usize,
1049 ) -> Result<(), ArrowError> {
1050 let offsets: &[T] = self.typed_buffer(0, self.len)?;
1051 let sizes: &[T] = self.typed_buffer(1, self.len)?;
1052 if offsets.len() != sizes.len() {
1053 return Err(ArrowError::ComputeError(format!(
1054 "ListView offsets len {} does not match sizes len {}",
1055 offsets.len(),
1056 sizes.len()
1057 )));
1058 }
1059
1060 for i in 0..sizes.len() {
1061 let size = sizes[i].to_usize().ok_or_else(|| {
1062 ArrowError::InvalidArgumentError(format!(
1063 "Error converting size[{}] ({}) to usize for {}",
1064 i, sizes[i], self.data_type
1065 ))
1066 })?;
1067 let offset = offsets[i].to_usize().ok_or_else(|| {
1068 ArrowError::InvalidArgumentError(format!(
1069 "Error converting offset[{}] ({}) to usize for {}",
1070 i, offsets[i], self.data_type
1071 ))
1072 })?;
1073 if size
1074 .checked_add(offset)
1075 .expect("Offset and size have exceeded the usize boundary")
1076 > values_length
1077 {
1078 return Err(ArrowError::InvalidArgumentError(format!(
1079 "Size {} at index {} is larger than the remaining values for {}",
1080 size, i, self.data_type
1081 )));
1082 }
1083 }
1084 Ok(())
1085 }
1086
1087 fn validate_child_data(&self) -> Result<(), ArrowError> {
1089 match &self.data_type {
1090 DataType::List(field) | DataType::Map(field, _) => {
1091 let values_data = self.get_single_valid_child_data(field.data_type())?;
1092 self.validate_offsets::<i32>(values_data.len)?;
1093 Ok(())
1094 }
1095 DataType::LargeList(field) => {
1096 let values_data = self.get_single_valid_child_data(field.data_type())?;
1097 self.validate_offsets::<i64>(values_data.len)?;
1098 Ok(())
1099 }
1100 DataType::ListView(field) => {
1101 let values_data = self.get_single_valid_child_data(field.data_type())?;
1102 self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1103 Ok(())
1104 }
1105 DataType::LargeListView(field) => {
1106 let values_data = self.get_single_valid_child_data(field.data_type())?;
1107 self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1108 Ok(())
1109 }
1110 DataType::FixedSizeList(field, list_size) => {
1111 let values_data = self.get_single_valid_child_data(field.data_type())?;
1112
1113 let list_size: usize = (*list_size).try_into().map_err(|_| {
1114 ArrowError::InvalidArgumentError(format!(
1115 "{} has a negative list_size {}",
1116 self.data_type, list_size
1117 ))
1118 })?;
1119
1120 let expected_values_len = self.len
1121 .checked_mul(list_size)
1122 .expect("integer overflow computing expected number of expected values in FixedListSize");
1123
1124 if values_data.len < expected_values_len {
1125 return Err(ArrowError::InvalidArgumentError(format!(
1126 "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1127 values_data.len, self.len, list_size, self.data_type
1128 )));
1129 }
1130
1131 Ok(())
1132 }
1133 DataType::Struct(fields) => {
1134 self.validate_num_child_data(fields.len())?;
1135 for (i, field) in fields.iter().enumerate() {
1136 let field_data = self.get_valid_child_data(i, field.data_type())?;
1137
1138 if field_data.len < self.len {
1140 return Err(ArrowError::InvalidArgumentError(format!(
1141 "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1142 self.data_type,
1143 i,
1144 field.name(),
1145 field_data.len,
1146 self.len
1147 )));
1148 }
1149 }
1150 Ok(())
1151 }
1152 DataType::RunEndEncoded(run_ends_field, values_field) => {
1153 self.validate_num_child_data(2)?;
1154 let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1155 let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1156 if run_ends_data.len != values_data.len {
1157 return Err(ArrowError::InvalidArgumentError(format!(
1158 "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1159 run_ends_data.len, values_data.len
1160 )));
1161 }
1162 if run_ends_data.nulls.is_some() {
1163 return Err(ArrowError::InvalidArgumentError(
1164 "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1165 ));
1166 }
1167 Ok(())
1168 }
1169 DataType::Union(fields, mode) => {
1170 self.validate_num_child_data(fields.len())?;
1171
1172 for (i, (_, field)) in fields.iter().enumerate() {
1173 let field_data = self.get_valid_child_data(i, field.data_type())?;
1174
1175 if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1176 return Err(ArrowError::InvalidArgumentError(format!(
1177 "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1178 i,
1179 field_data.len,
1180 self.len + self.offset
1181 )));
1182 }
1183 }
1184 Ok(())
1185 }
1186 DataType::Dictionary(_key_type, value_type) => {
1187 self.get_single_valid_child_data(value_type)?;
1188 Ok(())
1189 }
1190 _ => {
1191 if !self.child_data.is_empty() {
1193 return Err(ArrowError::InvalidArgumentError(format!(
1194 "Expected no child arrays for type {} but got {}",
1195 self.data_type,
1196 self.child_data.len()
1197 )));
1198 }
1199 Ok(())
1200 }
1201 }
1202 }
1203
1204 fn get_single_valid_child_data(
1208 &self,
1209 expected_type: &DataType,
1210 ) -> Result<&ArrayData, ArrowError> {
1211 self.validate_num_child_data(1)?;
1212 self.get_valid_child_data(0, expected_type)
1213 }
1214
1215 fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1217 if self.child_data.len() != expected_len {
1218 Err(ArrowError::InvalidArgumentError(format!(
1219 "Value data for {} should contain {} child data array(s), had {}",
1220 self.data_type,
1221 expected_len,
1222 self.child_data.len()
1223 )))
1224 } else {
1225 Ok(())
1226 }
1227 }
1228
1229 fn get_valid_child_data(
1232 &self,
1233 i: usize,
1234 expected_type: &DataType,
1235 ) -> Result<&ArrayData, ArrowError> {
1236 let values_data = self.child_data.get(i).ok_or_else(|| {
1237 ArrowError::InvalidArgumentError(format!(
1238 "{} did not have enough child arrays. Expected at least {} but had only {}",
1239 self.data_type,
1240 i + 1,
1241 self.child_data.len()
1242 ))
1243 })?;
1244
1245 if expected_type != &values_data.data_type {
1246 return Err(ArrowError::InvalidArgumentError(format!(
1247 "Child type mismatch for {}. Expected {} but child data had {}",
1248 self.data_type, expected_type, values_data.data_type
1249 )));
1250 }
1251
1252 values_data.validate()?;
1253 Ok(values_data)
1254 }
1255
1256 pub fn validate_data(&self) -> Result<(), ArrowError> {
1272 self.validate()?;
1273
1274 self.validate_nulls()?;
1275 self.validate_values()?;
1276 Ok(())
1277 }
1278
1279 pub fn validate_full(&self) -> Result<(), ArrowError> {
1284 self.validate_data()?;
1285 self.child_data
1287 .iter()
1288 .enumerate()
1289 .try_for_each(|(i, child_data)| {
1290 child_data.validate_full().map_err(|e| {
1291 ArrowError::InvalidArgumentError(format!(
1292 "{} child #{} invalid: {}",
1293 self.data_type, i, e
1294 ))
1295 })
1296 })?;
1297 Ok(())
1298 }
1299
1300 pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1310 if let Some(nulls) = &self.nulls {
1311 let actual = nulls.len() - nulls.inner().count_set_bits();
1312 if actual != nulls.null_count() {
1313 return Err(ArrowError::InvalidArgumentError(format!(
1314 "null_count value ({}) doesn't match actual number of nulls in array ({})",
1315 nulls.null_count(),
1316 actual
1317 )));
1318 }
1319 }
1320
1321 match &self.data_type {
1326 DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1327 if !f.is_nullable() {
1328 self.validate_non_nullable(None, &self.child_data[0])?
1329 }
1330 }
1331 DataType::FixedSizeList(field, len) => {
1332 let child = &self.child_data[0];
1333 if !field.is_nullable() {
1334 match &self.nulls {
1335 Some(nulls) => {
1336 let element_len = *len as usize;
1337 let expanded = nulls.expand(element_len);
1338 self.validate_non_nullable(Some(&expanded), child)?;
1339 }
1340 None => self.validate_non_nullable(None, child)?,
1341 }
1342 }
1343 }
1344 DataType::Struct(fields) => {
1345 for (field, child) in fields.iter().zip(&self.child_data) {
1346 if !field.is_nullable() {
1347 self.validate_non_nullable(self.nulls(), child)?
1348 }
1349 }
1350 }
1351 _ => {}
1352 }
1353
1354 Ok(())
1355 }
1356
1357 fn validate_non_nullable(
1359 &self,
1360 mask: Option<&NullBuffer>,
1361 child: &ArrayData,
1362 ) -> Result<(), ArrowError> {
1363 let mask = match mask {
1364 Some(mask) => mask,
1365 None => {
1366 return match child.null_count() {
1367 0 => Ok(()),
1368 _ => Err(ArrowError::InvalidArgumentError(format!(
1369 "non-nullable child of type {} contains nulls not present in parent {}",
1370 child.data_type, self.data_type
1371 ))),
1372 };
1373 }
1374 };
1375
1376 match child.nulls() {
1377 Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1378 "non-nullable child of type {} contains nulls not present in parent",
1379 child.data_type
1380 ))),
1381 _ => Ok(()),
1382 }
1383 }
1384
1385 pub fn validate_values(&self) -> Result<(), ArrowError> {
1391 match &self.data_type {
1392 DataType::Utf8 => self.validate_utf8::<i32>(),
1393 DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1394 DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1395 DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1396 DataType::BinaryView => {
1397 let views = self.typed_buffer::<u128>(0, self.len)?;
1398 validate_binary_view(views, &self.buffers[1..])
1399 }
1400 DataType::Utf8View => {
1401 let views = self.typed_buffer::<u128>(0, self.len)?;
1402 validate_string_view(views, &self.buffers[1..])
1403 }
1404 DataType::List(_) | DataType::Map(_, _) => {
1405 let child = &self.child_data[0];
1406 self.validate_offsets_full::<i32>(child.len)
1407 }
1408 DataType::LargeList(_) => {
1409 let child = &self.child_data[0];
1410 self.validate_offsets_full::<i64>(child.len)
1411 }
1412 DataType::Union(_, _) => {
1413 Ok(())
1419 }
1420 DataType::Dictionary(key_type, _value_type) => {
1421 let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1422 let max_value = dictionary_length - 1;
1423 match key_type.as_ref() {
1424 DataType::UInt8 => self.check_bounds::<u8>(max_value),
1425 DataType::UInt16 => self.check_bounds::<u16>(max_value),
1426 DataType::UInt32 => self.check_bounds::<u32>(max_value),
1427 DataType::UInt64 => self.check_bounds::<u64>(max_value),
1428 DataType::Int8 => self.check_bounds::<i8>(max_value),
1429 DataType::Int16 => self.check_bounds::<i16>(max_value),
1430 DataType::Int32 => self.check_bounds::<i32>(max_value),
1431 DataType::Int64 => self.check_bounds::<i64>(max_value),
1432 _ => unreachable!(),
1433 }
1434 }
1435 DataType::RunEndEncoded(run_ends, _values) => {
1436 let run_ends_data = self.child_data()[0].clone();
1437 match run_ends.data_type() {
1438 DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1439 DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1440 DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1441 _ => unreachable!(),
1442 }
1443 }
1444 _ => {
1445 Ok(())
1447 }
1448 }
1449 }
1450
1451 fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1462 where
1463 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1464 V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1465 {
1466 self.typed_offsets::<T>()?
1467 .iter()
1468 .enumerate()
1469 .map(|(i, x)| {
1470 let r = x.to_usize().ok_or_else(|| {
1472 ArrowError::InvalidArgumentError(format!(
1473 "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1474 );
1475 match r {
1477 Ok(n) if n <= offset_limit => Ok((i, n)),
1478 Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1479 "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1480 ),
1481 Err(e) => Err(e),
1482 }
1483 })
1484 .scan(0_usize, |start, end| {
1485 match end {
1487 Ok((i, end)) if *start <= end => {
1488 let range = Some(Ok((i, *start..end)));
1489 *start = end;
1490 range
1491 }
1492 Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1493 "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1494 i - 1, start, end))
1495 )),
1496 Err(err) => Some(Err(err)),
1497 }
1498 })
1499 .skip(1) .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1501 let (item_index, range) = res?;
1502 validate(item_index-1, range)
1503 })
1504 }
1505
1506 fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1509 where
1510 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1511 {
1512 let values_buffer = &self.buffers[1].as_slice();
1513 if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1514 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1516 if !values_str.is_char_boundary(range.start)
1517 || !values_str.is_char_boundary(range.end)
1518 {
1519 return Err(ArrowError::InvalidArgumentError(format!(
1520 "incomplete utf-8 byte sequence from index {string_index}"
1521 )));
1522 }
1523 Ok(())
1524 })
1525 } else {
1526 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1528 std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1529 ArrowError::InvalidArgumentError(format!(
1530 "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1531 ))
1532 })?;
1533 Ok(())
1534 })
1535 }
1536 }
1537
1538 fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1541 where
1542 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1543 {
1544 self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1545 Ok(())
1548 })
1549 }
1550
1551 fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1554 where
1555 T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1556 {
1557 let required_len = self.len + self.offset;
1558 let buffer = &self.buffers[0];
1559
1560 assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1563
1564 let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1566
1567 indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1568 if self.is_null(i) {
1570 return Ok(());
1571 }
1572 let dict_index: i64 = dict_index.try_into().map_err(|_| {
1573 ArrowError::InvalidArgumentError(format!(
1574 "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1575 ))
1576 })?;
1577
1578 if dict_index < 0 || dict_index > max_value {
1579 return Err(ArrowError::InvalidArgumentError(format!(
1580 "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1581 )));
1582 }
1583 Ok(())
1584 })
1585 }
1586
1587 fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1589 where
1590 T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1591 {
1592 let values = self.typed_buffer::<T>(0, self.len)?;
1593 let mut prev_value: i64 = 0_i64;
1594 values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1595 let value: i64 = inp_value.try_into().map_err(|_| {
1596 ArrowError::InvalidArgumentError(format!(
1597 "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1598 ))
1599 })?;
1600 if value <= 0_i64 {
1601 return Err(ArrowError::InvalidArgumentError(format!(
1602 "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1603 )));
1604 }
1605 if ix > 0 && value <= prev_value {
1606 return Err(ArrowError::InvalidArgumentError(format!(
1607 "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1608 )));
1609 }
1610
1611 prev_value = value;
1612 Ok(())
1613 })?;
1614
1615 if prev_value.as_usize() < (self.offset + self.len) {
1616 return Err(ArrowError::InvalidArgumentError(format!(
1617 "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1618 self.offset + self.len
1619 )));
1620 }
1621 Ok(())
1622 }
1623
1624 pub fn ptr_eq(&self, other: &Self) -> bool {
1628 if self.offset != other.offset
1629 || self.len != other.len
1630 || self.data_type != other.data_type
1631 || self.buffers.len() != other.buffers.len()
1632 || self.child_data.len() != other.child_data.len()
1633 {
1634 return false;
1635 }
1636
1637 match (&self.nulls, &other.nulls) {
1638 (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1639 (Some(_), None) | (None, Some(_)) => return false,
1640 _ => {}
1641 };
1642
1643 if !self
1644 .buffers
1645 .iter()
1646 .zip(other.buffers.iter())
1647 .all(|(a, b)| a.as_ptr() == b.as_ptr())
1648 {
1649 return false;
1650 }
1651
1652 self.child_data
1653 .iter()
1654 .zip(other.child_data.iter())
1655 .all(|(a, b)| a.ptr_eq(b))
1656 }
1657
1658 pub fn into_builder(self) -> ArrayDataBuilder {
1660 self.into()
1661 }
1662
1663 #[cfg(feature = "pool")]
1670 pub fn claim(&self, pool: &dyn arrow_buffer::MemoryPool) {
1671 for buffer in &self.buffers {
1673 buffer.claim(pool);
1674 }
1675
1676 if let Some(nulls) = &self.nulls {
1678 nulls.claim(pool);
1679 }
1680
1681 for child in &self.child_data {
1683 child.claim(pool);
1684 }
1685 }
1686}
1687
1688pub fn layout(data_type: &DataType) -> DataTypeLayout {
1691 use arrow_schema::IntervalUnit::*;
1694
1695 match data_type {
1696 DataType::Null => DataTypeLayout {
1697 buffers: vec![],
1698 can_contain_null_mask: false,
1699 variadic: false,
1700 },
1701 DataType::Boolean => DataTypeLayout {
1702 buffers: vec![BufferSpec::BitMap],
1703 can_contain_null_mask: true,
1704 variadic: false,
1705 },
1706 DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1707 DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1708 DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1709 DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1710 DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1711 DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1712 DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1713 DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1714 DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1715 DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1716 DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1717 DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1718 DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1719 DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1720 DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1721 DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1722 DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1723 DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1724 DataType::Interval(MonthDayNano) => {
1725 DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1726 }
1727 DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1728 DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1729 DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1730 DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1731 DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1732 DataType::FixedSizeBinary(size) => {
1733 let spec = BufferSpec::FixedWidth {
1734 byte_width: (*size).try_into().unwrap(),
1735 alignment: mem::align_of::<u8>(),
1736 };
1737 DataTypeLayout {
1738 buffers: vec![spec],
1739 can_contain_null_mask: true,
1740 variadic: false,
1741 }
1742 }
1743 DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1744 DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1745 DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1746 DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1747 DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1748 DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1750 DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1751 DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1752 DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1753 DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1754 DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), DataType::Union(_, mode) => {
1757 let type_ids = BufferSpec::FixedWidth {
1758 byte_width: mem::size_of::<i8>(),
1759 alignment: mem::align_of::<i8>(),
1760 };
1761
1762 DataTypeLayout {
1763 buffers: match mode {
1764 UnionMode::Sparse => {
1765 vec![type_ids]
1766 }
1767 UnionMode::Dense => {
1768 vec![
1769 type_ids,
1770 BufferSpec::FixedWidth {
1771 byte_width: mem::size_of::<i32>(),
1772 alignment: mem::align_of::<i32>(),
1773 },
1774 ]
1775 }
1776 },
1777 can_contain_null_mask: false,
1778 variadic: false,
1779 }
1780 }
1781 DataType::Dictionary(key_type, _value_type) => layout(key_type),
1782 }
1783}
1784
1785#[derive(Debug, PartialEq, Eq)]
1787pub struct DataTypeLayout {
1789 pub buffers: Vec<BufferSpec>,
1791
1792 pub can_contain_null_mask: bool,
1794
1795 pub variadic: bool,
1799}
1800
1801impl DataTypeLayout {
1802 pub fn new_fixed_width<T>() -> Self {
1804 Self {
1805 buffers: vec![BufferSpec::FixedWidth {
1806 byte_width: mem::size_of::<T>(),
1807 alignment: mem::align_of::<T>(),
1808 }],
1809 can_contain_null_mask: true,
1810 variadic: false,
1811 }
1812 }
1813
1814 pub fn new_nullable_empty() -> Self {
1817 Self {
1818 buffers: vec![],
1819 can_contain_null_mask: true,
1820 variadic: false,
1821 }
1822 }
1823
1824 pub fn new_empty() -> Self {
1827 Self {
1828 buffers: vec![],
1829 can_contain_null_mask: false,
1830 variadic: false,
1831 }
1832 }
1833
1834 pub fn new_binary<T>() -> Self {
1838 Self {
1839 buffers: vec![
1840 BufferSpec::FixedWidth {
1842 byte_width: mem::size_of::<T>(),
1843 alignment: mem::align_of::<T>(),
1844 },
1845 BufferSpec::VariableWidth,
1847 ],
1848 can_contain_null_mask: true,
1849 variadic: false,
1850 }
1851 }
1852
1853 pub fn new_view() -> Self {
1855 Self {
1856 buffers: vec![BufferSpec::FixedWidth {
1857 byte_width: mem::size_of::<u128>(),
1858 alignment: mem::align_of::<u128>(),
1859 }],
1860 can_contain_null_mask: true,
1861 variadic: true,
1862 }
1863 }
1864
1865 pub fn new_list_view<T>() -> Self {
1867 Self {
1868 buffers: vec![
1869 BufferSpec::FixedWidth {
1870 byte_width: mem::size_of::<T>(),
1871 alignment: mem::align_of::<T>(),
1872 },
1873 BufferSpec::FixedWidth {
1874 byte_width: mem::size_of::<T>(),
1875 alignment: mem::align_of::<T>(),
1876 },
1877 ],
1878 can_contain_null_mask: true,
1879 variadic: false,
1880 }
1881 }
1882}
1883
1884#[derive(Debug, PartialEq, Eq)]
1886pub enum BufferSpec {
1887 FixedWidth {
1898 byte_width: usize,
1900 alignment: usize,
1902 },
1903 VariableWidth,
1905 BitMap,
1911 #[allow(dead_code)]
1914 AlwaysNull,
1915}
1916
1917impl PartialEq for ArrayData {
1918 fn eq(&self, other: &Self) -> bool {
1919 equal::equal(self, other)
1920 }
1921}
1922
1923#[derive(Debug, Clone)]
1942#[doc(hidden)]
1943pub struct UnsafeFlag(bool);
1944
1945impl UnsafeFlag {
1946 #[inline]
1950 pub const fn new() -> Self {
1951 Self(false)
1952 }
1953
1954 #[inline]
1964 pub unsafe fn set(&mut self, val: bool) {
1965 self.0 = val;
1966 }
1967
1968 #[inline]
1970 pub fn get(&self) -> bool {
1971 self.0
1972 }
1973}
1974
1975impl Default for UnsafeFlag {
1977 fn default() -> Self {
1978 Self::new()
1979 }
1980}
1981
1982#[derive(Debug)]
1984pub struct ArrayDataBuilder {
1985 data_type: DataType,
1986 len: usize,
1987 null_count: Option<usize>,
1988 null_bit_buffer: Option<Buffer>,
1989 nulls: Option<NullBuffer>,
1990 offset: usize,
1991 buffers: Vec<Buffer>,
1992 child_data: Vec<ArrayData>,
1993 align_buffers: bool,
1997 skip_validation: UnsafeFlag,
2007}
2008
2009impl ArrayDataBuilder {
2010 #[inline]
2011 pub const fn new(data_type: DataType) -> Self {
2013 Self {
2014 data_type,
2015 len: 0,
2016 null_count: None,
2017 null_bit_buffer: None,
2018 nulls: None,
2019 offset: 0,
2020 buffers: vec![],
2021 child_data: vec![],
2022 align_buffers: false,
2023 skip_validation: UnsafeFlag::new(),
2024 }
2025 }
2026
2027 pub fn data_type(self, data_type: DataType) -> Self {
2029 Self { data_type, ..self }
2030 }
2031
2032 #[inline]
2033 #[allow(clippy::len_without_is_empty)]
2034 pub const fn len(mut self, n: usize) -> Self {
2036 self.len = n;
2037 self
2038 }
2039
2040 pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
2042 self.nulls = nulls;
2043 self.null_count = None;
2044 self.null_bit_buffer = None;
2045 self
2046 }
2047
2048 pub fn null_count(mut self, null_count: usize) -> Self {
2050 self.null_count = Some(null_count);
2051 self
2052 }
2053
2054 pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
2056 self.nulls = None;
2057 self.null_bit_buffer = buf;
2058 self
2059 }
2060
2061 #[inline]
2063 pub const fn offset(mut self, n: usize) -> Self {
2064 self.offset = n;
2065 self
2066 }
2067
2068 pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
2070 self.buffers = v;
2071 self
2072 }
2073
2074 pub fn add_buffer(mut self, b: Buffer) -> Self {
2076 self.buffers.push(b);
2077 self
2078 }
2079
2080 pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
2082 self.buffers.extend(bs);
2083 self
2084 }
2085
2086 pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
2088 self.child_data = v;
2089 self
2090 }
2091
2092 pub fn add_child_data(mut self, r: ArrayData) -> Self {
2094 self.child_data.push(r);
2095 self
2096 }
2097
2098 pub unsafe fn build_unchecked(self) -> ArrayData {
2114 unsafe { self.skip_validation(true) }.build().unwrap()
2115 }
2116
2117 pub fn build(self) -> Result<ArrayData, ArrowError> {
2126 let Self {
2127 data_type,
2128 len,
2129 null_count,
2130 null_bit_buffer,
2131 nulls,
2132 offset,
2133 buffers,
2134 child_data,
2135 align_buffers,
2136 skip_validation,
2137 } = self;
2138
2139 let nulls = nulls
2140 .or_else(|| {
2141 let buffer = null_bit_buffer?;
2142 let buffer = BooleanBuffer::new(buffer, offset, len);
2143 Some(match null_count {
2144 Some(n) => {
2145 unsafe { NullBuffer::new_unchecked(buffer, n) }
2147 }
2148 None => NullBuffer::new(buffer),
2149 })
2150 })
2151 .filter(|b| b.null_count() != 0);
2152
2153 let mut data = ArrayData {
2154 data_type,
2155 len,
2156 offset,
2157 buffers,
2158 child_data,
2159 nulls,
2160 };
2161
2162 if align_buffers {
2163 data.align_buffers();
2164 }
2165
2166 if !skip_validation.get() || cfg!(feature = "force_validate") {
2168 data.validate_data()?;
2169 }
2170 Ok(data)
2171 }
2172
2173 #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2175 pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2176 self.align_buffers(true).build()
2177 }
2178
2179 pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2195 self.align_buffers = align_buffers;
2196 self
2197 }
2198
2199 pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2213 unsafe {
2214 self.skip_validation.set(skip_validation);
2215 }
2216 self
2217 }
2218}
2219
2220impl From<ArrayData> for ArrayDataBuilder {
2221 fn from(d: ArrayData) -> Self {
2222 Self {
2223 data_type: d.data_type,
2224 len: d.len,
2225 offset: d.offset,
2226 buffers: d.buffers,
2227 child_data: d.child_data,
2228 nulls: d.nulls,
2229 null_bit_buffer: None,
2230 null_count: None,
2231 align_buffers: false,
2232 skip_validation: UnsafeFlag::new(),
2233 }
2234 }
2235}
2236
2237#[cfg(test)]
2238mod tests {
2239 use super::*;
2240 use arrow_schema::{Field, Fields};
2241
2242 fn make_i32_buffer(n: usize) -> Buffer {
2246 Buffer::from_slice_ref(vec![42i32; n])
2247 }
2248
2249 fn make_f32_buffer(n: usize) -> Buffer {
2251 Buffer::from_slice_ref(vec![42f32; n])
2252 }
2253
2254 #[test]
2255 fn test_builder() {
2256 let v = (0..25).collect::<Vec<i32>>();
2258 let b1 = Buffer::from_slice_ref(&v);
2259 let arr_data = ArrayData::builder(DataType::Int32)
2260 .len(20)
2261 .offset(5)
2262 .add_buffer(b1)
2263 .null_bit_buffer(Some(Buffer::from([
2264 0b01011111, 0b10110101, 0b01100011, 0b00011110,
2265 ])))
2266 .build()
2267 .unwrap();
2268
2269 assert_eq!(20, arr_data.len());
2270 assert_eq!(10, arr_data.null_count());
2271 assert_eq!(5, arr_data.offset());
2272 assert_eq!(1, arr_data.buffers().len());
2273 assert_eq!(
2274 Buffer::from_slice_ref(&v).as_slice(),
2275 arr_data.buffers()[0].as_slice()
2276 );
2277 }
2278
2279 #[test]
2280 fn test_builder_with_child_data() {
2281 let child_arr_data = ArrayData::try_new(
2282 DataType::Int32,
2283 5,
2284 None,
2285 0,
2286 vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2287 vec![],
2288 )
2289 .unwrap();
2290
2291 let field = Arc::new(Field::new("x", DataType::Int32, true));
2292 let data_type = DataType::Struct(vec![field].into());
2293
2294 let arr_data = ArrayData::builder(data_type)
2295 .len(5)
2296 .offset(0)
2297 .add_child_data(child_arr_data.clone())
2298 .build()
2299 .unwrap();
2300
2301 assert_eq!(5, arr_data.len());
2302 assert_eq!(1, arr_data.child_data().len());
2303 assert_eq!(child_arr_data, arr_data.child_data()[0]);
2304 }
2305
2306 #[test]
2307 fn test_null_count() {
2308 let mut bit_v: [u8; 2] = [0; 2];
2309 bit_util::set_bit(&mut bit_v, 0);
2310 bit_util::set_bit(&mut bit_v, 3);
2311 bit_util::set_bit(&mut bit_v, 10);
2312 let arr_data = ArrayData::builder(DataType::Int32)
2313 .len(16)
2314 .add_buffer(make_i32_buffer(16))
2315 .null_bit_buffer(Some(Buffer::from(bit_v)))
2316 .build()
2317 .unwrap();
2318 assert_eq!(13, arr_data.null_count());
2319
2320 let mut bit_v: [u8; 2] = [0; 2];
2322 bit_util::set_bit(&mut bit_v, 0);
2323 bit_util::set_bit(&mut bit_v, 3);
2324 bit_util::set_bit(&mut bit_v, 10);
2325 let arr_data = ArrayData::builder(DataType::Int32)
2326 .len(12)
2327 .offset(2)
2328 .add_buffer(make_i32_buffer(14)) .null_bit_buffer(Some(Buffer::from(bit_v)))
2330 .build()
2331 .unwrap();
2332 assert_eq!(10, arr_data.null_count());
2333 }
2334
2335 #[test]
2336 fn test_null_buffer_ref() {
2337 let mut bit_v: [u8; 2] = [0; 2];
2338 bit_util::set_bit(&mut bit_v, 0);
2339 bit_util::set_bit(&mut bit_v, 3);
2340 bit_util::set_bit(&mut bit_v, 10);
2341 let arr_data = ArrayData::builder(DataType::Int32)
2342 .len(16)
2343 .add_buffer(make_i32_buffer(16))
2344 .null_bit_buffer(Some(Buffer::from(bit_v)))
2345 .build()
2346 .unwrap();
2347 assert!(arr_data.nulls().is_some());
2348 assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2349 }
2350
2351 #[test]
2352 fn test_slice() {
2353 let mut bit_v: [u8; 2] = [0; 2];
2354 bit_util::set_bit(&mut bit_v, 0);
2355 bit_util::set_bit(&mut bit_v, 3);
2356 bit_util::set_bit(&mut bit_v, 10);
2357 let data = ArrayData::builder(DataType::Int32)
2358 .len(16)
2359 .add_buffer(make_i32_buffer(16))
2360 .null_bit_buffer(Some(Buffer::from(bit_v)))
2361 .build()
2362 .unwrap();
2363 let new_data = data.slice(1, 15);
2364 assert_eq!(data.len() - 1, new_data.len());
2365 assert_eq!(1, new_data.offset());
2366 assert_eq!(data.null_count(), new_data.null_count());
2367
2368 let new_data = new_data.slice(1, 14);
2370 assert_eq!(data.len() - 2, new_data.len());
2371 assert_eq!(2, new_data.offset());
2372 assert_eq!(data.null_count() - 1, new_data.null_count());
2373 }
2374
2375 #[test]
2376 fn test_equality() {
2377 let int_data = ArrayData::builder(DataType::Int32)
2378 .len(1)
2379 .add_buffer(make_i32_buffer(1))
2380 .build()
2381 .unwrap();
2382
2383 let float_data = ArrayData::builder(DataType::Float32)
2384 .len(1)
2385 .add_buffer(make_f32_buffer(1))
2386 .build()
2387 .unwrap();
2388 assert_ne!(int_data, float_data);
2389 assert!(!int_data.ptr_eq(&float_data));
2390 assert!(int_data.ptr_eq(&int_data));
2391
2392 #[allow(clippy::redundant_clone)]
2393 let int_data_clone = int_data.clone();
2394 assert_eq!(int_data, int_data_clone);
2395 assert!(int_data.ptr_eq(&int_data_clone));
2396 assert!(int_data_clone.ptr_eq(&int_data));
2397
2398 let int_data_slice = int_data_clone.slice(1, 0);
2399 assert!(int_data_slice.ptr_eq(&int_data_slice));
2400 assert!(!int_data.ptr_eq(&int_data_slice));
2401 assert!(!int_data_slice.ptr_eq(&int_data));
2402
2403 let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2404 let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2405 let string_data = ArrayData::try_new(
2406 DataType::Utf8,
2407 3,
2408 Some(Buffer::from_iter(vec![true, false, true])),
2409 0,
2410 vec![offsets_buffer, data_buffer],
2411 vec![],
2412 )
2413 .unwrap();
2414
2415 assert_ne!(float_data, string_data);
2416 assert!(!float_data.ptr_eq(&string_data));
2417
2418 assert!(string_data.ptr_eq(&string_data));
2419
2420 #[allow(clippy::redundant_clone)]
2421 let string_data_cloned = string_data.clone();
2422 assert!(string_data_cloned.ptr_eq(&string_data));
2423 assert!(string_data.ptr_eq(&string_data_cloned));
2424
2425 let string_data_slice = string_data.slice(1, 2);
2426 assert!(string_data_slice.ptr_eq(&string_data_slice));
2427 assert!(!string_data_slice.ptr_eq(&string_data))
2428 }
2429
2430 #[test]
2431 fn test_slice_memory_size() {
2432 let mut bit_v: [u8; 2] = [0; 2];
2433 bit_util::set_bit(&mut bit_v, 0);
2434 bit_util::set_bit(&mut bit_v, 3);
2435 bit_util::set_bit(&mut bit_v, 10);
2436 let data = ArrayData::builder(DataType::Int32)
2437 .len(16)
2438 .add_buffer(make_i32_buffer(16))
2439 .null_bit_buffer(Some(Buffer::from(bit_v)))
2440 .build()
2441 .unwrap();
2442 let new_data = data.slice(1, 14);
2443 assert_eq!(
2444 data.get_slice_memory_size().unwrap() - 8,
2445 new_data.get_slice_memory_size().unwrap()
2446 );
2447 let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2448 let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2449 let string_data = ArrayData::try_new(
2450 DataType::Utf8,
2451 3,
2452 Some(Buffer::from_iter(vec![true, false, true])),
2453 0,
2454 vec![offsets_buffer, data_buffer],
2455 vec![],
2456 )
2457 .unwrap();
2458 let string_data_slice = string_data.slice(1, 2);
2459 assert_eq!(
2461 string_data.get_slice_memory_size().unwrap() - 6,
2462 string_data_slice.get_slice_memory_size().unwrap()
2463 );
2464 }
2465
2466 #[test]
2467 fn test_count_nulls() {
2468 let buffer = Buffer::from([0b00010110, 0b10011111]);
2469 let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2470 let count = count_nulls(Some(&buffer), 0, 16);
2471 assert_eq!(count, 7);
2472
2473 let count = count_nulls(Some(&buffer), 4, 8);
2474 assert_eq!(count, 3);
2475 }
2476
2477 #[test]
2478 fn test_contains_nulls() {
2479 let buffer: Buffer =
2480 MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2481 let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2482 assert!(contains_nulls(Some(&buffer), 0, 6));
2483 assert!(contains_nulls(Some(&buffer), 0, 3));
2484 assert!(!contains_nulls(Some(&buffer), 3, 2));
2485 assert!(!contains_nulls(Some(&buffer), 0, 0));
2486 }
2487
2488 #[test]
2489 fn test_alignment() {
2490 let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2491 let sliced = buffer.slice(1);
2492
2493 let mut data = ArrayData {
2494 data_type: DataType::Int32,
2495 len: 0,
2496 offset: 0,
2497 buffers: vec![buffer],
2498 child_data: vec![],
2499 nulls: None,
2500 };
2501 data.validate_full().unwrap();
2502
2503 data.buffers[0] = sliced;
2505 let err = data.validate().unwrap_err();
2506
2507 assert_eq!(
2508 err.to_string(),
2509 "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2510 );
2511
2512 data.align_buffers();
2513 data.validate_full().unwrap();
2514 }
2515
2516 #[test]
2517 fn test_alignment_struct() {
2518 let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2519 let sliced = buffer.slice(1);
2520
2521 let child_data = ArrayData {
2522 data_type: DataType::Int32,
2523 len: 0,
2524 offset: 0,
2525 buffers: vec![buffer],
2526 child_data: vec![],
2527 nulls: None,
2528 };
2529
2530 let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2531 let mut data = ArrayData {
2532 data_type: schema,
2533 len: 0,
2534 offset: 0,
2535 buffers: vec![],
2536 child_data: vec![child_data],
2537 nulls: None,
2538 };
2539 data.validate_full().unwrap();
2540
2541 data.child_data[0].buffers[0] = sliced;
2543 let err = data.validate().unwrap_err();
2544
2545 assert_eq!(
2546 err.to_string(),
2547 "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2548 );
2549
2550 data.align_buffers();
2551 data.validate_full().unwrap();
2552 }
2553
2554 #[test]
2555 fn test_null_view_types() {
2556 let array_len = 32;
2557 let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2558 assert_eq!(array.len(), array_len);
2559 for i in 0..array.len() {
2560 assert!(array.is_null(i));
2561 }
2562
2563 let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2564 assert_eq!(array.len(), array_len);
2565 for i in 0..array.len() {
2566 assert!(array.is_null(i));
2567 }
2568
2569 let array = ArrayData::new_null(
2570 &DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2571 array_len,
2572 );
2573 assert_eq!(array.len(), array_len);
2574 for i in 0..array.len() {
2575 assert!(array.is_null(i));
2576 }
2577
2578 let array = ArrayData::new_null(
2579 &DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2580 array_len,
2581 );
2582 assert_eq!(array.len(), array_len);
2583 for i in 0..array.len() {
2584 assert!(array.is_null(i));
2585 }
2586 }
2587}