1use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24 ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35 null_bit_buffer: Option<&NullBuffer>,
36 offset: usize,
37 len: usize,
38) -> bool {
39 match null_bit_buffer {
40 Some(buffer) => {
41 match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42 Some((start, end)) => start != 0 || end != len,
43 None => len != 0, }
45 }
46 None => false, }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52 null_bit_buffer: Option<&NullBuffer>,
53 offset: usize,
54 len: usize,
55) -> usize {
56 if let Some(buf) = null_bit_buffer {
57 let buffer = buf.buffer();
58 len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59 } else {
60 0
61 }
62}
63
64#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67 let empty_buffer = MutableBuffer::new(0);
68 match data_type {
69 DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70 DataType::Boolean => {
71 let bytes = bit_util::ceil(capacity, 8);
72 let buffer = MutableBuffer::new(bytes);
73 [buffer, empty_buffer]
74 }
75 DataType::UInt8
76 | DataType::UInt16
77 | DataType::UInt32
78 | DataType::UInt64
79 | DataType::Int8
80 | DataType::Int16
81 | DataType::Int32
82 | DataType::Int64
83 | DataType::Float16
84 | DataType::Float32
85 | DataType::Float64
86 | DataType::Decimal32(_, _)
87 | DataType::Decimal64(_, _)
88 | DataType::Decimal128(_, _)
89 | DataType::Decimal256(_, _)
90 | DataType::Date32
91 | DataType::Time32(_)
92 | DataType::Date64
93 | DataType::Time64(_)
94 | DataType::Duration(_)
95 | DataType::Timestamp(_, _)
96 | DataType::Interval(_) => [
97 MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98 empty_buffer,
99 ],
100 DataType::Utf8 | DataType::Binary => {
101 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102 buffer.push(0i32);
104 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105 }
106 DataType::LargeUtf8 | DataType::LargeBinary => {
107 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108 buffer.push(0i64);
110 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111 }
112 DataType::BinaryView | DataType::Utf8View => [
113 MutableBuffer::new(capacity * mem::size_of::<u128>()),
114 empty_buffer,
115 ],
116 DataType::List(_) | DataType::Map(_, _) => {
117 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119 buffer.push(0i32);
120 [buffer, empty_buffer]
121 }
122 DataType::ListView(_) => [
123 MutableBuffer::new(capacity * mem::size_of::<i32>()),
124 MutableBuffer::new(capacity * mem::size_of::<i32>()),
125 ],
126 DataType::LargeList(_) => {
127 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129 buffer.push(0i64);
130 [buffer, empty_buffer]
131 }
132 DataType::LargeListView(_) => [
133 MutableBuffer::new(capacity * mem::size_of::<i64>()),
134 MutableBuffer::new(capacity * mem::size_of::<i64>()),
135 ],
136 DataType::FixedSizeBinary(size) => {
137 [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138 }
139 DataType::Dictionary(k, _) => [
140 MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141 empty_buffer,
142 ],
143 DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144 [empty_buffer, MutableBuffer::new(0)]
145 }
146 DataType::Union(_, mode) => {
147 let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148 match mode {
149 UnionMode::Sparse => [type_ids, empty_buffer],
150 UnionMode::Dense => {
151 let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152 [type_ids, offsets]
153 }
154 }
155 }
156 }
157}
158
/// A type-erased description of an array: its data type, logical length and
/// offset, the buffers backing it, its child arrays and optional null
/// information.
#[derive(Debug, Clone)]
pub struct ArrayData {
    /// The data type of this array.
    data_type: DataType,

    /// The number of logical elements.
    len: usize,

    /// The element offset into `buffers` at which this array's values start
    /// (slicing adds to this value instead of copying the buffers).
    offset: usize,

    /// The buffers backing this array; how many there are and what they hold
    /// depends on `data_type`.
    buffers: Vec<Buffer>,

    /// The child arrays, used by nested types such as lists, structs, unions
    /// and dictionaries.
    child_data: Vec<ArrayData>,

    /// The null information. When `None`, `is_null` reports `false` for
    /// every index.
    nulls: Option<NullBuffer>,
}
252
/// A reference-counted ([`Arc`]) handle to an [`ArrayData`].
pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257 pub unsafe fn new_unchecked(
274 data_type: DataType,
275 len: usize,
276 null_count: Option<usize>,
277 null_bit_buffer: Option<Buffer>,
278 offset: usize,
279 buffers: Vec<Buffer>,
280 child_data: Vec<ArrayData>,
281 ) -> Self {
282 let mut skip_validation = UnsafeFlag::new();
283 unsafe { skip_validation.set(true) };
285
286 ArrayDataBuilder {
287 data_type,
288 len,
289 null_count,
290 null_bit_buffer,
291 nulls: None,
292 offset,
293 buffers,
294 child_data,
295 align_buffers: false,
296 skip_validation,
297 }
298 .build()
299 .unwrap()
300 }
301
302 pub fn try_new(
313 data_type: DataType,
314 len: usize,
315 null_bit_buffer: Option<Buffer>,
316 offset: usize,
317 buffers: Vec<Buffer>,
318 child_data: Vec<ArrayData>,
319 ) -> Result<Self, ArrowError> {
320 if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
324 let needed_len = bit_util::ceil(len + offset, 8);
325 if null_bit_buffer.len() < needed_len {
326 return Err(ArrowError::InvalidArgumentError(format!(
327 "null_bit_buffer size too small. got {} needed {}",
328 null_bit_buffer.len(),
329 needed_len
330 )));
331 }
332 }
333 let new_self = unsafe {
335 Self::new_unchecked(
336 data_type,
337 len,
338 None,
339 null_bit_buffer,
340 offset,
341 buffers,
342 child_data,
343 )
344 };
345
346 new_self.validate_data()?;
351 Ok(new_self)
352 }
353
/// Returns a new [`ArrayDataBuilder`] for arrays of `data_type`.
#[inline]
pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
    ArrayDataBuilder::new(data_type)
}
359
/// Returns a reference to the [`DataType`] of this array.
#[inline]
pub const fn data_type(&self) -> &DataType {
    &self.data_type
}
365
366 pub fn buffers(&self) -> &[Buffer] {
368 &self.buffers
369 }
370
371 pub fn child_data(&self) -> &[ArrayData] {
374 &self.child_data[..]
375 }
376
377 #[inline]
379 pub fn is_null(&self, i: usize) -> bool {
380 match &self.nulls {
381 Some(v) => v.is_null(i),
382 None => false,
383 }
384 }
385
/// Returns a reference to this array's [`NullBuffer`], or `None` if it has
/// none.
#[inline]
pub fn nulls(&self) -> Option<&NullBuffer> {
    self.nulls.as_ref()
}
393
394 #[inline]
396 pub fn is_valid(&self, i: usize) -> bool {
397 !self.is_null(i)
398 }
399
/// Returns the number of logical elements in this array.
#[inline]
pub const fn len(&self) -> usize {
    self.len
}
405
406 #[inline]
408 pub const fn is_empty(&self) -> bool {
409 self.len == 0
410 }
411
/// Returns the element offset of this array into its buffers.
#[inline]
pub const fn offset(&self) -> usize {
    self.offset
}
417
418 #[inline]
420 pub fn null_count(&self) -> usize {
421 self.nulls
422 .as_ref()
423 .map(|x| x.null_count())
424 .unwrap_or_default()
425 }
426
427 pub fn get_buffer_memory_size(&self) -> usize {
439 let mut size = 0;
440 for buffer in &self.buffers {
441 size += buffer.capacity();
442 }
443 if let Some(bitmap) = &self.nulls {
444 size += bitmap.buffer().capacity()
445 }
446 for child in &self.child_data {
447 size += child.get_buffer_memory_size();
448 }
449 size
450 }
451
452 pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
465 let mut result: usize = 0;
466 let layout = layout(&self.data_type);
467
468 for spec in layout.buffers.iter() {
469 match spec {
470 BufferSpec::FixedWidth { byte_width, .. } => {
471 let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
472 ArrowError::ComputeError(
473 "Integer overflow computing buffer size".to_string(),
474 )
475 })?;
476 result += buffer_size;
477 }
478 BufferSpec::VariableWidth => {
479 let buffer_len = match self.data_type {
480 DataType::Utf8 | DataType::Binary => {
481 let offsets = self.typed_offsets::<i32>()?;
482 (offsets[self.len] - offsets[0]) as usize
483 }
484 DataType::LargeUtf8 | DataType::LargeBinary => {
485 let offsets = self.typed_offsets::<i64>()?;
486 (offsets[self.len] - offsets[0]) as usize
487 }
488 _ => {
489 return Err(ArrowError::NotYetImplemented(format!(
490 "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
491 self.data_type
492 )));
493 }
494 };
495 result += buffer_len;
496 }
497 BufferSpec::BitMap => {
498 let buffer_size = bit_util::ceil(self.len, 8);
499 result += buffer_size;
500 }
501 BufferSpec::AlwaysNull => {
502 }
504 }
505 }
506
507 if self.nulls().is_some() {
508 result += bit_util::ceil(self.len, 8);
509 }
510
511 for child in &self.child_data {
512 result += child.get_slice_memory_size()?;
513 }
514 Ok(result)
515 }
516
517 pub fn get_array_memory_size(&self) -> usize {
526 let mut size = mem::size_of_val(self);
527
528 for buffer in &self.buffers {
530 size += mem::size_of::<Buffer>();
531 size += buffer.capacity();
532 }
533 if let Some(nulls) = &self.nulls {
534 size += nulls.buffer().capacity();
535 }
536 for child in &self.child_data {
537 size += child.get_array_memory_size();
538 }
539
540 size
541 }
542
543 pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
551 assert!((offset + length) <= self.len());
552
553 if let DataType::Struct(_) = self.data_type() {
554 let new_offset = self.offset + offset;
556 ArrayData {
557 data_type: self.data_type().clone(),
558 len: length,
559 offset: new_offset,
560 buffers: self.buffers.clone(),
561 child_data: self
563 .child_data()
564 .iter()
565 .map(|data| data.slice(offset, length))
566 .collect(),
567 nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
568 }
569 } else {
570 let mut new_data = self.clone();
571
572 new_data.len = length;
573 new_data.offset = offset + self.offset;
574 new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
575
576 new_data
577 }
578 }
579
580 pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
587 &self.buffers()[buffer].typed_data()[self.offset..]
588 }
589
590 pub fn new_null(data_type: &DataType, len: usize) -> Self {
592 let bit_len = bit_util::ceil(len, 8);
593 let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
594
595 let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
596 Some(width) => (vec![zeroed(width * len)], vec![], true),
597 None => match data_type {
598 DataType::Null => (vec![], vec![], false),
599 DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
600 DataType::Binary | DataType::Utf8 => {
601 (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
602 }
603 DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
604 DataType::LargeBinary | DataType::LargeUtf8 => {
605 (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
606 }
607 DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
608 DataType::List(f) | DataType::Map(f, _) => (
609 vec![zeroed((len + 1) * 4)],
610 vec![ArrayData::new_empty(f.data_type())],
611 true,
612 ),
613 DataType::LargeList(f) => (
614 vec![zeroed((len + 1) * 8)],
615 vec![ArrayData::new_empty(f.data_type())],
616 true,
617 ),
618 DataType::ListView(f) => (
619 vec![zeroed(len * 4), zeroed(len * 4)],
620 vec![ArrayData::new_empty(f.data_type())],
621 true,
622 ),
623 DataType::LargeListView(f) => (
624 vec![zeroed(len * 8), zeroed(len * 8)],
625 vec![ArrayData::new_empty(f.data_type())],
626 true,
627 ),
628 DataType::FixedSizeList(f, list_len) => (
629 vec![],
630 vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
631 true,
632 ),
633 DataType::Struct(fields) => (
634 vec![],
635 fields
636 .iter()
637 .map(|f| Self::new_null(f.data_type(), len))
638 .collect(),
639 true,
640 ),
641 DataType::Dictionary(k, v) => (
642 vec![zeroed(k.primitive_width().unwrap() * len)],
643 vec![ArrayData::new_empty(v.as_ref())],
644 true,
645 ),
646 DataType::Union(f, mode) => {
647 let (id, _) = f.iter().next().unwrap();
648 let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
649 let buffers = match mode {
650 UnionMode::Sparse => vec![ids],
651 UnionMode::Dense => {
652 let end_offset = i32::from_usize(len).unwrap();
653 vec![ids, Buffer::from_iter(0_i32..end_offset)]
654 }
655 };
656
657 let children = f
658 .iter()
659 .enumerate()
660 .map(|(idx, (_, f))| {
661 if idx == 0 || *mode == UnionMode::Sparse {
662 Self::new_null(f.data_type(), len)
663 } else {
664 Self::new_empty(f.data_type())
665 }
666 })
667 .collect();
668
669 (buffers, children, false)
670 }
671 DataType::RunEndEncoded(r, v) => {
672 let runs = match r.data_type() {
673 DataType::Int16 => {
674 let i = i16::from_usize(len).expect("run overflow");
675 Buffer::from_slice_ref([i])
676 }
677 DataType::Int32 => {
678 let i = i32::from_usize(len).expect("run overflow");
679 Buffer::from_slice_ref([i])
680 }
681 DataType::Int64 => {
682 let i = i64::from_usize(len).expect("run overflow");
683 Buffer::from_slice_ref([i])
684 }
685 dt => unreachable!("Invalid run ends data type {dt}"),
686 };
687
688 let builder = ArrayData::builder(r.data_type().clone())
689 .len(1)
690 .buffers(vec![runs]);
691
692 let runs = unsafe { builder.build_unchecked() };
695 (
696 vec![],
697 vec![runs, ArrayData::new_null(v.data_type(), 1)],
698 false,
699 )
700 }
701 d => unreachable!("{d}"),
702 },
703 };
704
705 let mut builder = ArrayDataBuilder::new(data_type.clone())
706 .len(len)
707 .buffers(buffers)
708 .child_data(child_data);
709
710 if has_nulls {
711 builder = builder.nulls(Some(NullBuffer::new_null(len)))
712 }
713
714 unsafe { builder.build_unchecked() }
717 }
718
/// Returns a new [`ArrayData`] of `data_type` with zero elements, built by
/// calling [`Self::new_null`] with a length of `0`.
pub fn new_empty(data_type: &DataType) -> Self {
    Self::new_null(data_type, 0)
}
723
724 pub fn align_buffers(&mut self) {
733 let layout = layout(&self.data_type);
734 for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
735 if let BufferSpec::FixedWidth { alignment, .. } = spec {
736 if buffer.as_ptr().align_offset(*alignment) != 0 {
737 *buffer = Buffer::from_slice_ref(buffer.as_ref());
738 }
739 }
740 }
741 for data in self.child_data.iter_mut() {
743 data.align_buffers()
744 }
745 }
746
747 pub fn validate(&self) -> Result<(), ArrowError> {
758 let len_plus_offset = self.len + self.offset;
760
761 let layout = layout(&self.data_type);
763
764 if !layout.can_contain_null_mask && self.nulls.is_some() {
765 return Err(ArrowError::InvalidArgumentError(format!(
766 "Arrays of type {:?} cannot contain a null bitmask",
767 self.data_type,
768 )));
769 }
770
771 if self.buffers.len() < layout.buffers.len()
773 || (!layout.variadic && self.buffers.len() != layout.buffers.len())
774 {
775 return Err(ArrowError::InvalidArgumentError(format!(
776 "Expected {} buffers in array of type {:?}, got {}",
777 layout.buffers.len(),
778 self.data_type,
779 self.buffers.len(),
780 )));
781 }
782
783 for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
784 match spec {
785 BufferSpec::FixedWidth {
786 byte_width,
787 alignment,
788 } => {
789 let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
790
791 if buffer.len() < min_buffer_size {
792 return Err(ArrowError::InvalidArgumentError(format!(
793 "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
794 min_buffer_size,
795 i,
796 self.data_type,
797 buffer.len()
798 )));
799 }
800
801 let align_offset = buffer.as_ptr().align_offset(*alignment);
802 if align_offset != 0 {
803 return Err(ArrowError::InvalidArgumentError(format!(
804 "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
805 self.data_type,
806 align_offset.min(alignment - align_offset)
807 )));
808 }
809 }
810 BufferSpec::VariableWidth => {
811 }
815 BufferSpec::BitMap => {
816 let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
817 if buffer.len() < min_buffer_size {
818 return Err(ArrowError::InvalidArgumentError(format!(
819 "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
820 min_buffer_size,
821 i,
822 self.data_type,
823 buffer.len()
824 )));
825 }
826 }
827 BufferSpec::AlwaysNull => {
828 }
830 }
831 }
832
833 if let Some(nulls) = self.nulls() {
835 if nulls.null_count() > self.len {
836 return Err(ArrowError::InvalidArgumentError(format!(
837 "null_count {} for an array exceeds length of {} elements",
838 nulls.null_count(),
839 self.len
840 )));
841 }
842
843 let actual_len = nulls.validity().len();
844 let needed_len = bit_util::ceil(len_plus_offset, 8);
845 if actual_len < needed_len {
846 return Err(ArrowError::InvalidArgumentError(format!(
847 "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
848 )));
849 }
850
851 if nulls.len() != self.len {
852 return Err(ArrowError::InvalidArgumentError(format!(
853 "null buffer incorrect size. got {} expected {}",
854 nulls.len(),
855 self.len
856 )));
857 }
858 }
859
860 self.validate_child_data()?;
861
862 match &self.data_type {
864 DataType::Utf8 | DataType::Binary => {
865 self.validate_offsets::<i32>(self.buffers[1].len())?;
866 }
867 DataType::LargeUtf8 | DataType::LargeBinary => {
868 self.validate_offsets::<i64>(self.buffers[1].len())?;
869 }
870 DataType::Dictionary(key_type, _value_type) => {
871 if !DataType::is_dictionary_key_type(key_type) {
873 return Err(ArrowError::InvalidArgumentError(format!(
874 "Dictionary key type must be integer, but was {key_type}"
875 )));
876 }
877 }
878 DataType::RunEndEncoded(run_ends_type, _) => {
879 if run_ends_type.is_nullable() {
880 return Err(ArrowError::InvalidArgumentError(
881 "The nullable should be set to false for the field defining run_ends array.".to_string()
882 ));
883 }
884 if !DataType::is_run_ends_type(run_ends_type.data_type()) {
885 return Err(ArrowError::InvalidArgumentError(format!(
886 "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
887 run_ends_type.data_type()
888 )));
889 }
890 }
891 _ => {}
892 };
893
894 Ok(())
895 }
896
897 fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
904 if self.len == 0 && self.buffers[0].is_empty() {
906 return Ok(&[]);
907 }
908
909 self.typed_buffer(0, self.len + 1)
910 }
911
912 fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
914 &self,
915 idx: usize,
916 len: usize,
917 ) -> Result<&[T], ArrowError> {
918 let buffer = &self.buffers[idx];
919
920 let required_len = (len + self.offset) * mem::size_of::<T>();
921
922 if buffer.len() < required_len {
923 return Err(ArrowError::InvalidArgumentError(format!(
924 "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
925 idx,
926 self.data_type,
927 required_len,
928 buffer.len()
929 )));
930 }
931
932 Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
933 }
934
935 fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
938 &self,
939 values_length: usize,
940 ) -> Result<(), ArrowError> {
941 let offsets = self.typed_offsets::<T>()?;
943 if offsets.is_empty() {
944 return Ok(());
945 }
946
947 let first_offset = offsets[0].to_usize().ok_or_else(|| {
948 ArrowError::InvalidArgumentError(format!(
949 "Error converting offset[0] ({}) to usize for {}",
950 offsets[0], self.data_type
951 ))
952 })?;
953
954 let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
955 ArrowError::InvalidArgumentError(format!(
956 "Error converting offset[{}] ({}) to usize for {}",
957 self.len, offsets[self.len], self.data_type
958 ))
959 })?;
960
961 if first_offset > values_length {
962 return Err(ArrowError::InvalidArgumentError(format!(
963 "First offset {} of {} is larger than values length {}",
964 first_offset, self.data_type, values_length,
965 )));
966 }
967
968 if last_offset > values_length {
969 return Err(ArrowError::InvalidArgumentError(format!(
970 "Last offset {} of {} is larger than values length {}",
971 last_offset, self.data_type, values_length,
972 )));
973 }
974
975 if first_offset > last_offset {
976 return Err(ArrowError::InvalidArgumentError(format!(
977 "First offset {} in {} is smaller than last offset {}",
978 first_offset, self.data_type, last_offset,
979 )));
980 }
981
982 Ok(())
983 }
984
985 fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
988 &self,
989 values_length: usize,
990 ) -> Result<(), ArrowError> {
991 let offsets: &[T] = self.typed_buffer(0, self.len)?;
992 let sizes: &[T] = self.typed_buffer(1, self.len)?;
993 if offsets.len() != sizes.len() {
994 return Err(ArrowError::ComputeError(format!(
995 "ListView offsets len {} does not match sizes len {}",
996 offsets.len(),
997 sizes.len()
998 )));
999 }
1000
1001 for i in 0..sizes.len() {
1002 let size = sizes[i].to_usize().ok_or_else(|| {
1003 ArrowError::InvalidArgumentError(format!(
1004 "Error converting size[{}] ({}) to usize for {}",
1005 i, sizes[i], self.data_type
1006 ))
1007 })?;
1008 let offset = offsets[i].to_usize().ok_or_else(|| {
1009 ArrowError::InvalidArgumentError(format!(
1010 "Error converting offset[{}] ({}) to usize for {}",
1011 i, offsets[i], self.data_type
1012 ))
1013 })?;
1014 if size
1015 .checked_add(offset)
1016 .expect("Offset and size have exceeded the usize boundary")
1017 > values_length
1018 {
1019 return Err(ArrowError::InvalidArgumentError(format!(
1020 "Size {} at index {} is larger than the remaining values for {}",
1021 size, i, self.data_type
1022 )));
1023 }
1024 }
1025 Ok(())
1026 }
1027
1028 fn validate_child_data(&self) -> Result<(), ArrowError> {
1030 match &self.data_type {
1031 DataType::List(field) | DataType::Map(field, _) => {
1032 let values_data = self.get_single_valid_child_data(field.data_type())?;
1033 self.validate_offsets::<i32>(values_data.len)?;
1034 Ok(())
1035 }
1036 DataType::LargeList(field) => {
1037 let values_data = self.get_single_valid_child_data(field.data_type())?;
1038 self.validate_offsets::<i64>(values_data.len)?;
1039 Ok(())
1040 }
1041 DataType::ListView(field) => {
1042 let values_data = self.get_single_valid_child_data(field.data_type())?;
1043 self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1044 Ok(())
1045 }
1046 DataType::LargeListView(field) => {
1047 let values_data = self.get_single_valid_child_data(field.data_type())?;
1048 self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1049 Ok(())
1050 }
1051 DataType::FixedSizeList(field, list_size) => {
1052 let values_data = self.get_single_valid_child_data(field.data_type())?;
1053
1054 let list_size: usize = (*list_size).try_into().map_err(|_| {
1055 ArrowError::InvalidArgumentError(format!(
1056 "{} has a negative list_size {}",
1057 self.data_type, list_size
1058 ))
1059 })?;
1060
1061 let expected_values_len = self.len
1062 .checked_mul(list_size)
1063 .expect("integer overflow computing expected number of expected values in FixedListSize");
1064
1065 if values_data.len < expected_values_len {
1066 return Err(ArrowError::InvalidArgumentError(format!(
1067 "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1068 values_data.len, self.len, list_size, self.data_type
1069 )));
1070 }
1071
1072 Ok(())
1073 }
1074 DataType::Struct(fields) => {
1075 self.validate_num_child_data(fields.len())?;
1076 for (i, field) in fields.iter().enumerate() {
1077 let field_data = self.get_valid_child_data(i, field.data_type())?;
1078
1079 if field_data.len < self.len {
1081 return Err(ArrowError::InvalidArgumentError(format!(
1082 "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1083 self.data_type,
1084 i,
1085 field.name(),
1086 field_data.len,
1087 self.len
1088 )));
1089 }
1090 }
1091 Ok(())
1092 }
1093 DataType::RunEndEncoded(run_ends_field, values_field) => {
1094 self.validate_num_child_data(2)?;
1095 let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1096 let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1097 if run_ends_data.len != values_data.len {
1098 return Err(ArrowError::InvalidArgumentError(format!(
1099 "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1100 run_ends_data.len, values_data.len
1101 )));
1102 }
1103 if run_ends_data.nulls.is_some() {
1104 return Err(ArrowError::InvalidArgumentError(
1105 "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1106 ));
1107 }
1108 Ok(())
1109 }
1110 DataType::Union(fields, mode) => {
1111 self.validate_num_child_data(fields.len())?;
1112
1113 for (i, (_, field)) in fields.iter().enumerate() {
1114 let field_data = self.get_valid_child_data(i, field.data_type())?;
1115
1116 if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1117 return Err(ArrowError::InvalidArgumentError(format!(
1118 "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1119 i,
1120 field_data.len,
1121 self.len + self.offset
1122 )));
1123 }
1124 }
1125 Ok(())
1126 }
1127 DataType::Dictionary(_key_type, value_type) => {
1128 self.get_single_valid_child_data(value_type)?;
1129 Ok(())
1130 }
1131 _ => {
1132 if !self.child_data.is_empty() {
1134 return Err(ArrowError::InvalidArgumentError(format!(
1135 "Expected no child arrays for type {} but got {}",
1136 self.data_type,
1137 self.child_data.len()
1138 )));
1139 }
1140 Ok(())
1141 }
1142 }
1143 }
1144
1145 fn get_single_valid_child_data(
1149 &self,
1150 expected_type: &DataType,
1151 ) -> Result<&ArrayData, ArrowError> {
1152 self.validate_num_child_data(1)?;
1153 self.get_valid_child_data(0, expected_type)
1154 }
1155
1156 fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1158 if self.child_data.len() != expected_len {
1159 Err(ArrowError::InvalidArgumentError(format!(
1160 "Value data for {} should contain {} child data array(s), had {}",
1161 self.data_type,
1162 expected_len,
1163 self.child_data.len()
1164 )))
1165 } else {
1166 Ok(())
1167 }
1168 }
1169
1170 fn get_valid_child_data(
1173 &self,
1174 i: usize,
1175 expected_type: &DataType,
1176 ) -> Result<&ArrayData, ArrowError> {
1177 let values_data = self.child_data.get(i).ok_or_else(|| {
1178 ArrowError::InvalidArgumentError(format!(
1179 "{} did not have enough child arrays. Expected at least {} but had only {}",
1180 self.data_type,
1181 i + 1,
1182 self.child_data.len()
1183 ))
1184 })?;
1185
1186 if expected_type != &values_data.data_type {
1187 return Err(ArrowError::InvalidArgumentError(format!(
1188 "Child type mismatch for {}. Expected {} but child data had {}",
1189 self.data_type, expected_type, values_data.data_type
1190 )));
1191 }
1192
1193 values_data.validate()?;
1194 Ok(values_data)
1195 }
1196
1197 pub fn validate_data(&self) -> Result<(), ArrowError> {
1213 self.validate()?;
1214
1215 self.validate_nulls()?;
1216 self.validate_values()?;
1217 Ok(())
1218 }
1219
1220 pub fn validate_full(&self) -> Result<(), ArrowError> {
1225 self.validate_data()?;
1226 self.child_data
1228 .iter()
1229 .enumerate()
1230 .try_for_each(|(i, child_data)| {
1231 child_data.validate_full().map_err(|e| {
1232 ArrowError::InvalidArgumentError(format!(
1233 "{} child #{} invalid: {}",
1234 self.data_type, i, e
1235 ))
1236 })
1237 })?;
1238 Ok(())
1239 }
1240
1241 pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1251 if let Some(nulls) = &self.nulls {
1252 let actual = nulls.len() - nulls.inner().count_set_bits();
1253 if actual != nulls.null_count() {
1254 return Err(ArrowError::InvalidArgumentError(format!(
1255 "null_count value ({}) doesn't match actual number of nulls in array ({})",
1256 nulls.null_count(),
1257 actual
1258 )));
1259 }
1260 }
1261
1262 match &self.data_type {
1267 DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1268 if !f.is_nullable() {
1269 self.validate_non_nullable(None, &self.child_data[0])?
1270 }
1271 }
1272 DataType::FixedSizeList(field, len) => {
1273 let child = &self.child_data[0];
1274 if !field.is_nullable() {
1275 match &self.nulls {
1276 Some(nulls) => {
1277 let element_len = *len as usize;
1278 let expanded = nulls.expand(element_len);
1279 self.validate_non_nullable(Some(&expanded), child)?;
1280 }
1281 None => self.validate_non_nullable(None, child)?,
1282 }
1283 }
1284 }
1285 DataType::Struct(fields) => {
1286 for (field, child) in fields.iter().zip(&self.child_data) {
1287 if !field.is_nullable() {
1288 self.validate_non_nullable(self.nulls(), child)?
1289 }
1290 }
1291 }
1292 _ => {}
1293 }
1294
1295 Ok(())
1296 }
1297
1298 fn validate_non_nullable(
1300 &self,
1301 mask: Option<&NullBuffer>,
1302 child: &ArrayData,
1303 ) -> Result<(), ArrowError> {
1304 let mask = match mask {
1305 Some(mask) => mask,
1306 None => {
1307 return match child.null_count() {
1308 0 => Ok(()),
1309 _ => Err(ArrowError::InvalidArgumentError(format!(
1310 "non-nullable child of type {} contains nulls not present in parent {}",
1311 child.data_type, self.data_type
1312 ))),
1313 };
1314 }
1315 };
1316
1317 match child.nulls() {
1318 Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1319 "non-nullable child of type {} contains nulls not present in parent",
1320 child.data_type
1321 ))),
1322 _ => Ok(()),
1323 }
1324 }
1325
1326 pub fn validate_values(&self) -> Result<(), ArrowError> {
1332 match &self.data_type {
1333 DataType::Utf8 => self.validate_utf8::<i32>(),
1334 DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1335 DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1336 DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1337 DataType::BinaryView => {
1338 let views = self.typed_buffer::<u128>(0, self.len)?;
1339 validate_binary_view(views, &self.buffers[1..])
1340 }
1341 DataType::Utf8View => {
1342 let views = self.typed_buffer::<u128>(0, self.len)?;
1343 validate_string_view(views, &self.buffers[1..])
1344 }
1345 DataType::List(_) | DataType::Map(_, _) => {
1346 let child = &self.child_data[0];
1347 self.validate_offsets_full::<i32>(child.len)
1348 }
1349 DataType::LargeList(_) => {
1350 let child = &self.child_data[0];
1351 self.validate_offsets_full::<i64>(child.len)
1352 }
1353 DataType::Union(_, _) => {
1354 Ok(())
1360 }
1361 DataType::Dictionary(key_type, _value_type) => {
1362 let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1363 let max_value = dictionary_length - 1;
1364 match key_type.as_ref() {
1365 DataType::UInt8 => self.check_bounds::<u8>(max_value),
1366 DataType::UInt16 => self.check_bounds::<u16>(max_value),
1367 DataType::UInt32 => self.check_bounds::<u32>(max_value),
1368 DataType::UInt64 => self.check_bounds::<u64>(max_value),
1369 DataType::Int8 => self.check_bounds::<i8>(max_value),
1370 DataType::Int16 => self.check_bounds::<i16>(max_value),
1371 DataType::Int32 => self.check_bounds::<i32>(max_value),
1372 DataType::Int64 => self.check_bounds::<i64>(max_value),
1373 _ => unreachable!(),
1374 }
1375 }
1376 DataType::RunEndEncoded(run_ends, _values) => {
1377 let run_ends_data = self.child_data()[0].clone();
1378 match run_ends.data_type() {
1379 DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1380 DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1381 DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1382 _ => unreachable!(),
1383 }
1384 }
1385 _ => {
1386 Ok(())
1388 }
1389 }
1390 }
1391
/// Walks the offsets of this array and calls `validate(i, start..end)` for
/// each element `i`, where `start` and `end` are consecutive offsets.
///
/// Each offset must convert to `usize` and be at most `offset_limit`, and
/// offsets must not decrease from one position to the next.
///
/// NOTE(review): the pipeline is lazy and the entry produced for the first
/// offset is discarded by `skip(1)`, so an error detected on the first
/// offset itself is not reported by this function — confirm callers check
/// the first offset separately.
///
/// # Errors
/// Returns [`ArrowError::InvalidArgumentError`] for the first offset (after
/// the first one) that fails a check, or the first error returned by
/// `validate`.
fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
where
    T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
    V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
{
    self.typed_offsets::<T>()?
        .iter()
        .enumerate()
        .map(|(i, x)| {
            // check if the offset can be converted to usize
            let r = x.to_usize().ok_or_else(|| {
                ArrowError::InvalidArgumentError(format!(
                    "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
                );
            // check if the offset exceeds the limit
            match r {
                Ok(n) if n <= offset_limit => Ok((i, n)),
                Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
                    "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
                ),
                Err(e) => Err(e),
            }
        })
        // The scan state is the previous offset (initially 0); errors from
        // the previous stage pass through without updating it.
        .scan(0_usize, |start, end| {
            // check offsets are monotonically increasing
            match end {
                Ok((i, end)) if *start <= end => {
                    let range = Some(Ok((i, *start..end)));
                    *start = end;
                    range
                }
                Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
                    "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
                    i - 1, start, end))
                )),
                Err(err) => Some(Err(err)),
            }
        })
        // The entry for the first offset does not describe an element.
        .skip(1)
        .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
            let (item_index, range) = res?;
            // Position `item_index` in the offsets closes element
            // `item_index - 1`.
            validate(item_index-1, range)
        })
}
1446
1447 fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1450 where
1451 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1452 {
1453 let values_buffer = &self.buffers[1].as_slice();
1454 if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1455 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1457 if !values_str.is_char_boundary(range.start)
1458 || !values_str.is_char_boundary(range.end)
1459 {
1460 return Err(ArrowError::InvalidArgumentError(format!(
1461 "incomplete utf-8 byte sequence from index {string_index}"
1462 )));
1463 }
1464 Ok(())
1465 })
1466 } else {
1467 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1469 std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1470 ArrowError::InvalidArgumentError(format!(
1471 "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1472 ))
1473 })?;
1474 Ok(())
1475 })
1476 }
1477 }
1478
1479 fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1482 where
1483 T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1484 {
1485 self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1486 Ok(())
1489 })
1490 }
1491
1492 fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1495 where
1496 T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1497 {
1498 let required_len = self.len + self.offset;
1499 let buffer = &self.buffers[0];
1500
1501 assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1504
1505 let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1507
1508 indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1509 if self.is_null(i) {
1511 return Ok(());
1512 }
1513 let dict_index: i64 = dict_index.try_into().map_err(|_| {
1514 ArrowError::InvalidArgumentError(format!(
1515 "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1516 ))
1517 })?;
1518
1519 if dict_index < 0 || dict_index > max_value {
1520 return Err(ArrowError::InvalidArgumentError(format!(
1521 "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1522 )));
1523 }
1524 Ok(())
1525 })
1526 }
1527
1528 fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1530 where
1531 T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1532 {
1533 let values = self.typed_buffer::<T>(0, self.len)?;
1534 let mut prev_value: i64 = 0_i64;
1535 values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1536 let value: i64 = inp_value.try_into().map_err(|_| {
1537 ArrowError::InvalidArgumentError(format!(
1538 "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1539 ))
1540 })?;
1541 if value <= 0_i64 {
1542 return Err(ArrowError::InvalidArgumentError(format!(
1543 "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1544 )));
1545 }
1546 if ix > 0 && value <= prev_value {
1547 return Err(ArrowError::InvalidArgumentError(format!(
1548 "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1549 )));
1550 }
1551
1552 prev_value = value;
1553 Ok(())
1554 })?;
1555
1556 if prev_value.as_usize() < (self.offset + self.len) {
1557 return Err(ArrowError::InvalidArgumentError(format!(
1558 "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1559 self.offset + self.len
1560 )));
1561 }
1562 Ok(())
1563 }
1564
1565 pub fn ptr_eq(&self, other: &Self) -> bool {
1569 if self.offset != other.offset
1570 || self.len != other.len
1571 || self.data_type != other.data_type
1572 || self.buffers.len() != other.buffers.len()
1573 || self.child_data.len() != other.child_data.len()
1574 {
1575 return false;
1576 }
1577
1578 match (&self.nulls, &other.nulls) {
1579 (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1580 (Some(_), None) | (None, Some(_)) => return false,
1581 _ => {}
1582 };
1583
1584 if !self
1585 .buffers
1586 .iter()
1587 .zip(other.buffers.iter())
1588 .all(|(a, b)| a.as_ptr() == b.as_ptr())
1589 {
1590 return false;
1591 }
1592
1593 self.child_data
1594 .iter()
1595 .zip(other.child_data.iter())
1596 .all(|(a, b)| a.ptr_eq(b))
1597 }
1598
1599 pub fn into_builder(self) -> ArrayDataBuilder {
1601 self.into()
1602 }
1603}
1604
1605pub fn layout(data_type: &DataType) -> DataTypeLayout {
1608 use arrow_schema::IntervalUnit::*;
1611
1612 match data_type {
1613 DataType::Null => DataTypeLayout {
1614 buffers: vec![],
1615 can_contain_null_mask: false,
1616 variadic: false,
1617 },
1618 DataType::Boolean => DataTypeLayout {
1619 buffers: vec![BufferSpec::BitMap],
1620 can_contain_null_mask: true,
1621 variadic: false,
1622 },
1623 DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1624 DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1625 DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1626 DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1627 DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1628 DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1629 DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1630 DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1631 DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1632 DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1633 DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1634 DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1635 DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1636 DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1637 DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1638 DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1639 DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1640 DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1641 DataType::Interval(MonthDayNano) => {
1642 DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1643 }
1644 DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1645 DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1646 DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1647 DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1648 DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1649 DataType::FixedSizeBinary(size) => {
1650 let spec = BufferSpec::FixedWidth {
1651 byte_width: (*size).try_into().unwrap(),
1652 alignment: mem::align_of::<u8>(),
1653 };
1654 DataTypeLayout {
1655 buffers: vec![spec],
1656 can_contain_null_mask: true,
1657 variadic: false,
1658 }
1659 }
1660 DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1661 DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1662 DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1663 DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1664 DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1665 DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1667 DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1668 DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1669 DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1670 DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1671 DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), DataType::Union(_, mode) => {
1674 let type_ids = BufferSpec::FixedWidth {
1675 byte_width: mem::size_of::<i8>(),
1676 alignment: mem::align_of::<i8>(),
1677 };
1678
1679 DataTypeLayout {
1680 buffers: match mode {
1681 UnionMode::Sparse => {
1682 vec![type_ids]
1683 }
1684 UnionMode::Dense => {
1685 vec![
1686 type_ids,
1687 BufferSpec::FixedWidth {
1688 byte_width: mem::size_of::<i32>(),
1689 alignment: mem::align_of::<i32>(),
1690 },
1691 ]
1692 }
1693 },
1694 can_contain_null_mask: false,
1695 variadic: false,
1696 }
1697 }
1698 DataType::Dictionary(key_type, _value_type) => layout(key_type),
1699 }
1700}
1701
/// Describes the buffers that an array of some data type is made of.
#[derive(Debug, PartialEq, Eq)]
pub struct DataTypeLayout {
    /// One [`BufferSpec`] per expected buffer, in buffer order.
    pub buffers: Vec<BufferSpec>,

    /// Whether an array with this layout may carry a null mask.
    pub can_contain_null_mask: bool,

    /// Set to `true` only by [`DataTypeLayout::new_view`] and
    /// [`DataTypeLayout::new_list_view`].
    /// NOTE(review): presumably signals that more buffers than listed in
    /// `buffers` are acceptable — confirm at the validation call sites.
    pub variadic: bool,
}

impl DataTypeLayout {
    /// Spec for a buffer of `T` values: element width and alignment of `T`.
    fn fixed_width_spec<T>() -> BufferSpec {
        BufferSpec::FixedWidth {
            byte_width: mem::size_of::<T>(),
            alignment: mem::align_of::<T>(),
        }
    }

    /// Layout with a single fixed-width buffer of `T`; null mask allowed.
    pub fn new_fixed_width<T>() -> Self {
        let buffers = vec![Self::fixed_width_spec::<T>()];
        Self {
            buffers,
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Layout without any buffers that may still carry a null mask.
    pub fn new_nullable_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Layout without any buffers and without a null mask.
    pub fn new_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: false,
            variadic: false,
        }
    }

    /// Layout with a fixed-width buffer of `T` followed by a variable-width
    /// buffer; null mask allowed.
    pub fn new_binary<T>() -> Self {
        let offsets = Self::fixed_width_spec::<T>();
        Self {
            buffers: vec![offsets, BufferSpec::VariableWidth],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Layout with one fixed-width buffer of `u128`-sized elements, marked as
    /// variadic; null mask allowed.
    pub fn new_view() -> Self {
        let views = Self::fixed_width_spec::<u128>();
        Self {
            buffers: vec![views],
            can_contain_null_mask: true,
            variadic: true,
        }
    }

    /// Layout with two fixed-width buffers of `T`, marked as variadic; null
    /// mask allowed.
    ///
    /// NOTE(review): `variadic: true` is kept as found; whether a list view
    /// should really accept additional buffers deserves confirmation.
    pub fn new_list_view<T>() -> Self {
        let first = Self::fixed_width_spec::<T>();
        let second = Self::fixed_width_spec::<T>();
        Self {
            buffers: vec![first, second],
            can_contain_null_mask: true,
            variadic: true,
        }
    }
}

/// Describes a single buffer of a [`DataTypeLayout`].
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Buffer of elements that all have the same width.
    FixedWidth {
        /// Width of one element in bytes.
        byte_width: usize,
        /// Alignment recorded for the elements, in bytes.
        alignment: usize,
    },
    /// Buffer whose elements do not share a fixed width.
    VariableWidth,
    /// Bit-packed buffer (used for the boolean layout).
    BitMap,
    /// Not constructed anywhere in the visible code, hence the lint exemption.
    #[allow(dead_code)]
    AlwaysNull,
}
1833
/// Equality between two `ArrayData` values is delegated to `equal::equal`.
///
/// This is separate from `ArrayData::ptr_eq`, which compares buffer pointers.
impl PartialEq for ArrayData {
    fn eq(&self, other: &Self) -> bool {
        // All comparison logic lives in the crate's `equal` module.
        equal::equal(self, other)
    }
}
1839
/// A boolean flag that starts out `false`, can be read freely, and can only
/// be changed through an `unsafe` method.
///
/// `ArrayDataBuilder` uses it to record whether validation should be skipped.
#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a flag holding `false`.
    #[inline]
    pub const fn new() -> Self {
        UnsafeFlag(false)
    }

    /// Stores `val` in the flag.
    ///
    /// # Safety
    /// The assignment itself is an ordinary write. The method is `unsafe`,
    /// presumably so that whoever turns the flag on takes responsibility for
    /// whatever checks are bypassed as a result — confirm against the callers.
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        let UnsafeFlag(state) = self;
        *state = val;
    }

    /// Returns the current value of the flag.
    #[inline]
    pub fn get(&self) -> bool {
        let UnsafeFlag(state) = self;
        *state
    }
}

impl Default for UnsafeFlag {
    /// Equivalent to [`UnsafeFlag::new`]: the flag starts out `false`.
    fn default() -> Self {
        UnsafeFlag::new()
    }
}
1898
/// Builder that collects the parts of an `ArrayData` and assembles them in
/// `build`.
#[derive(Debug)]
pub struct ArrayDataBuilder {
    /// Data type of the array being built.
    data_type: DataType,
    /// Number of elements in the array.
    len: usize,
    /// Optional explicit null count; only consulted when the null buffer is
    /// created from `null_bit_buffer`.
    null_count: Option<usize>,
    /// Raw null bitmap; used only when `nulls` is `None`.
    null_bit_buffer: Option<Buffer>,
    /// Ready-made null buffer; takes precedence over `null_bit_buffer`.
    nulls: Option<NullBuffer>,
    /// Element offset into the buffers.
    offset: usize,
    /// Data buffers of the array.
    buffers: Vec<Buffer>,
    /// Child arrays of the array.
    child_data: Vec<ArrayData>,
    /// When `true`, `build` calls `ArrayData::align_buffers` before validating.
    align_buffers: bool,
    /// When set, `build` skips `validate_data` unless the `force_validate`
    /// feature is enabled.
    skip_validation: UnsafeFlag,
}
1925
1926impl ArrayDataBuilder {
1927 #[inline]
1928 pub const fn new(data_type: DataType) -> Self {
1930 Self {
1931 data_type,
1932 len: 0,
1933 null_count: None,
1934 null_bit_buffer: None,
1935 nulls: None,
1936 offset: 0,
1937 buffers: vec![],
1938 child_data: vec![],
1939 align_buffers: false,
1940 skip_validation: UnsafeFlag::new(),
1941 }
1942 }
1943
1944 pub fn data_type(self, data_type: DataType) -> Self {
1946 Self { data_type, ..self }
1947 }
1948
1949 #[inline]
1950 #[allow(clippy::len_without_is_empty)]
1951 pub const fn len(mut self, n: usize) -> Self {
1953 self.len = n;
1954 self
1955 }
1956
1957 pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1959 self.nulls = nulls;
1960 self.null_count = None;
1961 self.null_bit_buffer = None;
1962 self
1963 }
1964
1965 pub fn null_count(mut self, null_count: usize) -> Self {
1967 self.null_count = Some(null_count);
1968 self
1969 }
1970
1971 pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1973 self.nulls = None;
1974 self.null_bit_buffer = buf;
1975 self
1976 }
1977
1978 #[inline]
1980 pub const fn offset(mut self, n: usize) -> Self {
1981 self.offset = n;
1982 self
1983 }
1984
1985 pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1987 self.buffers = v;
1988 self
1989 }
1990
1991 pub fn add_buffer(mut self, b: Buffer) -> Self {
1993 self.buffers.push(b);
1994 self
1995 }
1996
1997 pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1999 self.buffers.extend(bs);
2000 self
2001 }
2002
2003 pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
2005 self.child_data = v;
2006 self
2007 }
2008
2009 pub fn add_child_data(mut self, r: ArrayData) -> Self {
2011 self.child_data.push(r);
2012 self
2013 }
2014
2015 pub unsafe fn build_unchecked(self) -> ArrayData {
2031 unsafe { self.skip_validation(true) }.build().unwrap()
2032 }
2033
2034 pub fn build(self) -> Result<ArrayData, ArrowError> {
2043 let Self {
2044 data_type,
2045 len,
2046 null_count,
2047 null_bit_buffer,
2048 nulls,
2049 offset,
2050 buffers,
2051 child_data,
2052 align_buffers,
2053 skip_validation,
2054 } = self;
2055
2056 let nulls = nulls
2057 .or_else(|| {
2058 let buffer = null_bit_buffer?;
2059 let buffer = BooleanBuffer::new(buffer, offset, len);
2060 Some(match null_count {
2061 Some(n) => {
2062 unsafe { NullBuffer::new_unchecked(buffer, n) }
2064 }
2065 None => NullBuffer::new(buffer),
2066 })
2067 })
2068 .filter(|b| b.null_count() != 0);
2069
2070 let mut data = ArrayData {
2071 data_type,
2072 len,
2073 offset,
2074 buffers,
2075 child_data,
2076 nulls,
2077 };
2078
2079 if align_buffers {
2080 data.align_buffers();
2081 }
2082
2083 if !skip_validation.get() || cfg!(feature = "force_validate") {
2085 data.validate_data()?;
2086 }
2087 Ok(data)
2088 }
2089
2090 #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2092 pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2093 self.align_buffers(true).build()
2094 }
2095
2096 pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2112 self.align_buffers = align_buffers;
2113 self
2114 }
2115
2116 pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2130 unsafe {
2131 self.skip_validation.set(skip_validation);
2132 }
2133 self
2134 }
2135}
2136
2137impl From<ArrayData> for ArrayDataBuilder {
2138 fn from(d: ArrayData) -> Self {
2139 Self {
2140 data_type: d.data_type,
2141 len: d.len,
2142 offset: d.offset,
2143 buffers: d.buffers,
2144 child_data: d.child_data,
2145 nulls: d.nulls,
2146 null_bit_buffer: None,
2147 null_count: None,
2148 align_buffers: false,
2149 skip_validation: UnsafeFlag::new(),
2150 }
2151 }
2152}
2153
/// Unit tests for `ArrayData`, `ArrayDataBuilder` and the null-counting helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::{Field, Fields};

    /// Returns a buffer holding `n` copies of `42i32`.
    fn make_i32_buffer(n: usize) -> Buffer {
        Buffer::from_slice_ref(vec![42i32; n])
    }

    /// Returns a buffer holding `n` copies of `42f32`.
    fn make_f32_buffer(n: usize) -> Buffer {
        Buffer::from_slice_ref(vec![42f32; n])
    }

    /// Builds an Int32 array with offset, length and a null bitmap, then
    /// checks that the builder preserved each of them.
    #[test]
    fn test_builder() {
        let v = (0..25).collect::<Vec<i32>>();
        let b1 = Buffer::from_slice_ref(&v);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(20)
            .offset(5)
            .add_buffer(b1)
            .null_bit_buffer(Some(Buffer::from([
                0b01011111, 0b10110101, 0b01100011, 0b00011110,
            ])))
            .build()
            .unwrap();

        assert_eq!(20, arr_data.len());
        // Bits 5..25 of the bitmap contain 10 set bits, leaving 10 nulls.
        assert_eq!(10, arr_data.null_count());
        assert_eq!(5, arr_data.offset());
        assert_eq!(1, arr_data.buffers().len());
        assert_eq!(
            Buffer::from_slice_ref(&v).as_slice(),
            arr_data.buffers()[0].as_slice()
        );
    }

    /// Builds a struct array with a single Int32 child and checks the child
    /// is stored unchanged.
    #[test]
    fn test_builder_with_child_data() {
        let child_arr_data = ArrayData::try_new(
            DataType::Int32,
            5,
            None,
            0,
            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
            vec![],
        )
        .unwrap();

        let field = Arc::new(Field::new("x", DataType::Int32, true));
        let data_type = DataType::Struct(vec![field].into());

        let arr_data = ArrayData::builder(data_type)
            .len(5)
            .offset(0)
            .add_child_data(child_arr_data.clone())
            .build()
            .unwrap();

        assert_eq!(5, arr_data.len());
        assert_eq!(1, arr_data.child_data().len());
        assert_eq!(child_arr_data, arr_data.child_data()[0]);
    }

    /// Checks the null count derived from a bitmap, without and with an offset.
    #[test]
    fn test_null_count() {
        // Bits 0, 3 and 10 are set: 3 of 16 bits, hence 13 nulls.
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert_eq!(13, arr_data.null_count());

        // Same bitmap, but only bits 2..14 are in range: bits 3 and 10 are
        // set, hence 10 nulls out of 12.
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(12)
            .offset(2)
            .add_buffer(make_i32_buffer(14))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert_eq!(10, arr_data.null_count());
    }

    /// Checks that the built array exposes the bitmap bytes it was given.
    #[test]
    fn test_null_buffer_ref() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert!(arr_data.nulls().is_some());
        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
    }

    /// Checks length, offset and null count after slicing once and twice.
    #[test]
    fn test_slice() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        // Dropping element 0 (valid) leaves the null count unchanged.
        let new_data = data.slice(1, 15);
        assert_eq!(data.len() - 1, new_data.len());
        assert_eq!(1, new_data.offset());
        assert_eq!(data.null_count(), new_data.null_count());

        // Slicing the slice: offsets accumulate, and element 1 (null) is dropped.
        let new_data = new_data.slice(1, 14);
        assert_eq!(data.len() - 2, new_data.len());
        assert_eq!(2, new_data.offset());
        assert_eq!(data.null_count() - 1, new_data.null_count());
    }

    /// Exercises `==`/`!=` and `ptr_eq` across different types, clones and slices.
    #[test]
    fn test_equality() {
        let int_data = ArrayData::builder(DataType::Int32)
            .len(1)
            .add_buffer(make_i32_buffer(1))
            .build()
            .unwrap();

        let float_data = ArrayData::builder(DataType::Float32)
            .len(1)
            .add_buffer(make_f32_buffer(1))
            .build()
            .unwrap();
        assert_ne!(int_data, float_data);
        assert!(!int_data.ptr_eq(&float_data));
        assert!(int_data.ptr_eq(&int_data));

        // A clone is expected to be pointer-equal to its source.
        #[allow(clippy::redundant_clone)]
        let int_data_clone = int_data.clone();
        assert_eq!(int_data, int_data_clone);
        assert!(int_data.ptr_eq(&int_data_clone));
        assert!(int_data_clone.ptr_eq(&int_data));

        // A slice differs in offset/length, so it is not pointer-equal.
        let int_data_slice = int_data_clone.slice(1, 0);
        assert!(int_data_slice.ptr_eq(&int_data_slice));
        assert!(!int_data.ptr_eq(&int_data_slice));
        assert!(!int_data_slice.ptr_eq(&int_data));

        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
        let string_data = ArrayData::try_new(
            DataType::Utf8,
            3,
            Some(Buffer::from_iter(vec![true, false, true])),
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();

        assert_ne!(float_data, string_data);
        assert!(!float_data.ptr_eq(&string_data));

        assert!(string_data.ptr_eq(&string_data));

        #[allow(clippy::redundant_clone)]
        let string_data_cloned = string_data.clone();
        assert!(string_data_cloned.ptr_eq(&string_data));
        assert!(string_data.ptr_eq(&string_data_cloned));

        let string_data_slice = string_data.slice(1, 2);
        assert!(string_data_slice.ptr_eq(&string_data_slice));
        assert!(!string_data_slice.ptr_eq(&string_data))
    }

    /// Checks that `get_slice_memory_size` shrinks by the expected number of
    /// bytes when an array is sliced.
    #[test]
    fn test_slice_memory_size() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        let new_data = data.slice(1, 14);
        // Two fewer i32 elements; presumably that accounts for the 8 bytes.
        assert_eq!(
            data.get_slice_memory_size().unwrap() - 8,
            new_data.get_slice_memory_size().unwrap()
        );
        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
        let string_data = ArrayData::try_new(
            DataType::Utf8,
            3,
            Some(Buffer::from_iter(vec![true, false, true])),
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
        let string_data_slice = string_data.slice(1, 2);
        // The string slice is expected to be 6 bytes smaller than the full array.
        assert_eq!(
            string_data.get_slice_memory_size().unwrap() - 6,
            string_data_slice.get_slice_memory_size().unwrap()
        );
    }

    /// Checks `count_nulls` over the whole bitmap and over a sub-range.
    #[test]
    fn test_count_nulls() {
        // 9 of the 16 bits are set, so 7 positions are null.
        let buffer = Buffer::from([0b00010110, 0b10011111]);
        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
        let count = count_nulls(Some(&buffer), 0, 16);
        assert_eq!(count, 7);

        // Bits 4..12 contain 5 set bits, so 3 of those 8 positions are null.
        let count = count_nulls(Some(&buffer), 4, 8);
        assert_eq!(count, 3);
    }

    /// Checks `contains_nulls` for mixed, all-null, all-valid and empty ranges.
    #[test]
    fn test_contains_nulls() {
        let buffer: Buffer =
            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
        assert!(contains_nulls(Some(&buffer), 0, 6));
        assert!(contains_nulls(Some(&buffer), 0, 3));
        assert!(!contains_nulls(Some(&buffer), 3, 2));
        assert!(!contains_nulls(Some(&buffer), 0, 0));
    }

    /// Checks that a misaligned buffer fails validation and that
    /// `align_buffers` repairs it.
    #[test]
    fn test_alignment() {
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        let sliced = buffer.slice(1);

        let mut data = ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![buffer],
            child_data: vec![],
            nulls: None,
        };
        data.validate_full().unwrap();

        // Swap in the sliced buffer, which the error below reports as off by 1.
        data.buffers[0] = sliced;
        let err = data.validate().unwrap_err();

        assert_eq!(
            err.to_string(),
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
        );

        data.align_buffers();
        data.validate_full().unwrap();
    }

    /// Same as `test_alignment`, but the misaligned buffer sits in a struct's
    /// child array.
    #[test]
    fn test_alignment_struct() {
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        let sliced = buffer.slice(1);

        let child_data = ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![buffer],
            child_data: vec![],
            nulls: None,
        };

        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
        let mut data = ArrayData {
            data_type: schema,
            len: 0,
            offset: 0,
            buffers: vec![],
            child_data: vec![child_data],
            nulls: None,
        };
        data.validate_full().unwrap();

        // Misalign the child's buffer; validating the parent must report it.
        data.child_data[0].buffers[0] = sliced;
        let err = data.validate().unwrap_err();

        assert_eq!(
            err.to_string(),
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
        );

        data.align_buffers();
        data.validate_full().unwrap();
    }

    /// Checks that `ArrayData::new_null` yields all-null arrays of the
    /// requested length for the view types.
    #[test]
    fn test_null_view_types() {
        let array_len = 32;
        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }

        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }

        let array = ArrayData::new_null(
            &DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
            array_len,
        );
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }

        let array = ArrayData::new_null(
            &DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
            array_len,
        );
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }
    }
}