1use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24 bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35 null_bit_buffer: Option<&NullBuffer>,
36 offset: usize,
37 len: usize,
38) -> bool {
39 match null_bit_buffer {
40 Some(buffer) => {
41 match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42 Some((start, end)) => start != 0 || end != len,
43 None => len != 0, }
45 }
46 None => false, }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52 null_bit_buffer: Option<&NullBuffer>,
53 offset: usize,
54 len: usize,
55) -> usize {
56 if let Some(buf) = null_bit_buffer {
57 let buffer = buf.buffer();
58 len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59 } else {
60 0
61 }
62}
63
64#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67 let empty_buffer = MutableBuffer::new(0);
68 match data_type {
69 DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70 DataType::Boolean => {
71 let bytes = bit_util::ceil(capacity, 8);
72 let buffer = MutableBuffer::new(bytes);
73 [buffer, empty_buffer]
74 }
75 DataType::UInt8
76 | DataType::UInt16
77 | DataType::UInt32
78 | DataType::UInt64
79 | DataType::Int8
80 | DataType::Int16
81 | DataType::Int32
82 | DataType::Int64
83 | DataType::Float16
84 | DataType::Float32
85 | DataType::Float64
86 | DataType::Decimal32(_, _)
87 | DataType::Decimal64(_, _)
88 | DataType::Decimal128(_, _)
89 | DataType::Decimal256(_, _)
90 | DataType::Date32
91 | DataType::Time32(_)
92 | DataType::Date64
93 | DataType::Time64(_)
94 | DataType::Duration(_)
95 | DataType::Timestamp(_, _)
96 | DataType::Interval(_) => [
97 MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98 empty_buffer,
99 ],
100 DataType::Utf8 | DataType::Binary => {
101 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102 buffer.push(0i32);
104 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105 }
106 DataType::LargeUtf8 | DataType::LargeBinary => {
107 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108 buffer.push(0i64);
110 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111 }
112 DataType::BinaryView | DataType::Utf8View => [
113 MutableBuffer::new(capacity * mem::size_of::<u128>()),
114 empty_buffer,
115 ],
116 DataType::List(_) | DataType::Map(_, _) => {
117 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119 buffer.push(0i32);
120 [buffer, empty_buffer]
121 }
122 DataType::ListView(_) => [
123 MutableBuffer::new(capacity * mem::size_of::<i32>()),
124 MutableBuffer::new(capacity * mem::size_of::<i32>()),
125 ],
126 DataType::LargeList(_) => {
127 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129 buffer.push(0i64);
130 [buffer, empty_buffer]
131 }
132 DataType::LargeListView(_) => [
133 MutableBuffer::new(capacity * mem::size_of::<i64>()),
134 MutableBuffer::new(capacity * mem::size_of::<i64>()),
135 ],
136 DataType::FixedSizeBinary(size) => {
137 [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138 }
139 DataType::Dictionary(k, _) => [
140 MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141 empty_buffer,
142 ],
143 DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144 [empty_buffer, MutableBuffer::new(0)]
145 }
146 DataType::Union(_, mode) => {
147 let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148 match mode {
149 UnionMode::Sparse => [type_ids, empty_buffer],
150 UnionMode::Dense => {
151 let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152 [type_ids, offsets]
153 }
154 }
155 }
156 }
157}
158
/// Untyped description of an array: its logical type, length, offset, data
/// buffers, child arrays and optional validity information.
#[derive(Debug, Clone)]
pub struct ArrayData {
    /// Logical type of the array; selects the buffer layout checked by `validate`.
    data_type: DataType,

    /// Number of elements in the array.
    len: usize,

    /// Number of leading elements to skip; typed buffer views start at this index.
    offset: usize,

    /// Data buffers; how many are expected depends on `data_type`.
    buffers: Vec<Buffer>,

    /// Child arrays for nested types (lists, structs, unions, dictionaries, ...).
    child_data: Vec<ArrayData>,

    /// Validity information; `None` means no element is reported as null.
    nulls: Option<NullBuffer>,
}
252
/// An [`ArrayData`] shared behind an [`Arc`].
pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257 pub unsafe fn new_unchecked(
274 data_type: DataType,
275 len: usize,
276 null_count: Option<usize>,
277 null_bit_buffer: Option<Buffer>,
278 offset: usize,
279 buffers: Vec<Buffer>,
280 child_data: Vec<ArrayData>,
281 ) -> Self {
282 let mut skip_validation = UnsafeFlag::new();
283 skip_validation.set(true);
285
286 ArrayDataBuilder {
287 data_type,
288 len,
289 null_count,
290 null_bit_buffer,
291 nulls: None,
292 offset,
293 buffers,
294 child_data,
295 align_buffers: false,
296 skip_validation,
297 }
298 .build()
299 .unwrap()
300 }
301
302 pub fn try_new(
313 data_type: DataType,
314 len: usize,
315 null_bit_buffer: Option<Buffer>,
316 offset: usize,
317 buffers: Vec<Buffer>,
318 child_data: Vec<ArrayData>,
319 ) -> Result<Self, ArrowError> {
320 if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
324 let needed_len = bit_util::ceil(len + offset, 8);
325 if null_bit_buffer.len() < needed_len {
326 return Err(ArrowError::InvalidArgumentError(format!(
327 "null_bit_buffer size too small. got {} needed {}",
328 null_bit_buffer.len(),
329 needed_len
330 )));
331 }
332 }
333 let new_self = unsafe {
335 Self::new_unchecked(
336 data_type,
337 len,
338 None,
339 null_bit_buffer,
340 offset,
341 buffers,
342 child_data,
343 )
344 };
345
346 new_self.validate_data()?;
351 Ok(new_self)
352 }
353
    /// Returns a new [`ArrayDataBuilder`] for an array of `data_type`.
    #[inline]
    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
        ArrayDataBuilder::new(data_type)
    }
359
    /// Returns a reference to the [`DataType`] of this array.
    #[inline]
    pub const fn data_type(&self) -> &DataType {
        &self.data_type
    }
365
366 pub fn buffers(&self) -> &[Buffer] {
368 &self.buffers
369 }
370
371 pub fn child_data(&self) -> &[ArrayData] {
374 &self.child_data[..]
375 }
376
377 #[inline]
379 pub fn is_null(&self, i: usize) -> bool {
380 match &self.nulls {
381 Some(v) => v.is_null(i),
382 None => false,
383 }
384 }
385
    /// Returns a reference to the [`NullBuffer`] of this array, if it has one.
    #[inline]
    pub fn nulls(&self) -> Option<&NullBuffer> {
        self.nulls.as_ref()
    }
393
394 #[inline]
396 pub fn is_valid(&self, i: usize) -> bool {
397 !self.is_null(i)
398 }
399
    /// Returns the number of elements in this array.
    #[inline]
    pub const fn len(&self) -> usize {
        self.len
    }
405
    /// Returns `true` if this array has zero elements.
    #[inline]
    pub const fn is_empty(&self) -> bool {
        self.len == 0
    }
411
    /// Returns the element offset of this array into its buffers.
    #[inline]
    pub const fn offset(&self) -> usize {
        self.offset
    }
417
418 #[inline]
420 pub fn null_count(&self) -> usize {
421 self.nulls
422 .as_ref()
423 .map(|x| x.null_count())
424 .unwrap_or_default()
425 }
426
427 pub fn get_buffer_memory_size(&self) -> usize {
439 let mut size = 0;
440 for buffer in &self.buffers {
441 size += buffer.capacity();
442 }
443 if let Some(bitmap) = &self.nulls {
444 size += bitmap.buffer().capacity()
445 }
446 for child in &self.child_data {
447 size += child.get_buffer_memory_size();
448 }
449 size
450 }
451
452 pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
465 let mut result: usize = 0;
466 let layout = layout(&self.data_type);
467
468 for spec in layout.buffers.iter() {
469 match spec {
470 BufferSpec::FixedWidth { byte_width, .. } => {
471 let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
472 ArrowError::ComputeError(
473 "Integer overflow computing buffer size".to_string(),
474 )
475 })?;
476 result += buffer_size;
477 }
478 BufferSpec::VariableWidth => {
479 let buffer_len: usize;
480 match self.data_type {
481 DataType::Utf8 | DataType::Binary => {
482 let offsets = self.typed_offsets::<i32>()?;
483 buffer_len = (offsets[self.len] - offsets[0] ) as usize;
484 }
485 DataType::LargeUtf8 | DataType::LargeBinary => {
486 let offsets = self.typed_offsets::<i64>()?;
487 buffer_len = (offsets[self.len] - offsets[0]) as usize;
488 }
489 _ => {
490 return Err(ArrowError::NotYetImplemented(format!(
491 "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
492 self.data_type
493 )))
494 }
495 };
496 result += buffer_len;
497 }
498 BufferSpec::BitMap => {
499 let buffer_size = bit_util::ceil(self.len, 8);
500 result += buffer_size;
501 }
502 BufferSpec::AlwaysNull => {
503 }
505 }
506 }
507
508 if self.nulls().is_some() {
509 result += bit_util::ceil(self.len, 8);
510 }
511
512 for child in &self.child_data {
513 result += child.get_slice_memory_size()?;
514 }
515 Ok(result)
516 }
517
518 pub fn get_array_memory_size(&self) -> usize {
527 let mut size = mem::size_of_val(self);
528
529 for buffer in &self.buffers {
531 size += mem::size_of::<Buffer>();
532 size += buffer.capacity();
533 }
534 if let Some(nulls) = &self.nulls {
535 size += nulls.buffer().capacity();
536 }
537 for child in &self.child_data {
538 size += child.get_array_memory_size();
539 }
540
541 size
542 }
543
544 pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
552 assert!((offset + length) <= self.len());
553
554 if let DataType::Struct(_) = self.data_type() {
555 let new_offset = self.offset + offset;
557 let new_data = ArrayData {
558 data_type: self.data_type().clone(),
559 len: length,
560 offset: new_offset,
561 buffers: self.buffers.clone(),
562 child_data: self
564 .child_data()
565 .iter()
566 .map(|data| data.slice(offset, length))
567 .collect(),
568 nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
569 };
570
571 new_data
572 } else {
573 let mut new_data = self.clone();
574
575 new_data.len = length;
576 new_data.offset = offset + self.offset;
577 new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
578
579 new_data
580 }
581 }
582
583 pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
590 &self.buffers()[buffer].typed_data()[self.offset..]
591 }
592
    /// Builds an [`ArrayData`] of `data_type` with `len` elements, all null.
    ///
    /// Types flagged `has_nulls` below get zero-filled buffers and an all-null
    /// [`NullBuffer`] of length `len`. `Null`, `Union` and `RunEndEncoded` get
    /// no null buffer; unions and run-end arrays point at null children instead.
    ///
    /// # Panics
    ///
    /// Panics for a union with no fields, when `len` does not fit the dense
    /// union offset or run-end integer type, or for a data type not handled
    /// by any arm below.
    pub fn new_null(data_type: &DataType, len: usize) -> Self {
        let bit_len = bit_util::ceil(len, 8);
        // Helper producing a zero-filled immutable buffer of the given byte length.
        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));

        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
            // Every type reporting a primitive width: one zeroed values buffer.
            Some(width) => (vec![zeroed(width * len)], vec![], true),
            None => match data_type {
                DataType::Null => (vec![], vec![], false),
                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
                DataType::Binary | DataType::Utf8 => {
                    // len + 1 zero offsets (4 bytes each) and an empty values buffer.
                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
                }
                // 16 zero bytes per view.
                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
                DataType::LargeBinary | DataType::LargeUtf8 => {
                    // len + 1 zero offsets (8 bytes each) and an empty values buffer.
                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
                }
                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
                DataType::List(f) | DataType::Map(f, _) => (
                    vec![zeroed((len + 1) * 4)],
                    vec![ArrayData::new_empty(f.data_type())],
                    true,
                ),
                DataType::LargeList(f) => (
                    vec![zeroed((len + 1) * 8)],
                    vec![ArrayData::new_empty(f.data_type())],
                    true,
                ),
                // Fixed-size lists need a child with list_len * len (null) values.
                DataType::FixedSizeList(f, list_len) => (
                    vec![],
                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
                    true,
                ),
                DataType::Struct(fields) => (
                    vec![],
                    fields
                        .iter()
                        .map(|f| Self::new_null(f.data_type(), len))
                        .collect(),
                    true,
                ),
                DataType::Dictionary(k, v) => (
                    // Zeroed keys with an empty dictionary; the null buffer marks all keys null.
                    vec![zeroed(k.primitive_width().unwrap() * len)],
                    vec![ArrayData::new_empty(v.as_ref())],
                    true,
                ),
                DataType::Union(f, mode) => {
                    // Every slot refers to the first union field.
                    let (id, _) = f.iter().next().unwrap();
                    let ids = Buffer::from_iter(std::iter::repeat(id).take(len));
                    let buffers = match mode {
                        UnionMode::Sparse => vec![ids],
                        UnionMode::Dense => {
                            // Dense offsets 0..len index into the first child.
                            let end_offset = i32::from_usize(len).unwrap();
                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
                        }
                    };

                    let children = f
                        .iter()
                        .enumerate()
                        .map(|(idx, (_, f))| {
                            // Sparse: every child has `len` nulls. Dense: only the
                            // first child holds values, the rest are empty.
                            if idx == 0 || *mode == UnionMode::Sparse {
                                Self::new_null(f.data_type(), len)
                            } else {
                                Self::new_empty(f.data_type())
                            }
                        })
                        .collect();

                    (buffers, children, false)
                }
                DataType::RunEndEncoded(r, v) => {
                    // A single run ending at `len`, paired with one null value.
                    let runs = match r.data_type() {
                        DataType::Int16 => {
                            let i = i16::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        DataType::Int32 => {
                            let i = i32::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        DataType::Int64 => {
                            let i = i64::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        dt => unreachable!("Invalid run ends data type {dt}"),
                    };

                    let builder = ArrayData::builder(r.data_type().clone())
                        .len(1)
                        .buffers(vec![runs]);

                    // SAFETY: the run-ends array is a single-element integer
                    // buffer of the matching type, built just above.
                    let runs = unsafe { builder.build_unchecked() };
                    (
                        vec![],
                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
                        false,
                    )
                }
                // NOTE(review): presumably every remaining type is caught by the
                // `Some(width)` arm above — confirm for ListView / LargeListView.
                d => unreachable!("{d}"),
            },
        };

        let mut builder = ArrayDataBuilder::new(data_type.clone())
            .len(len)
            .buffers(buffers)
            .child_data(child_data);

        if has_nulls {
            builder = builder.nulls(Some(NullBuffer::new_null(len)))
        }

        // SAFETY: buffers, children and null buffer were constructed above to
        // match `data_type` and `len`.
        unsafe { builder.build_unchecked() }
    }
711
    /// Builds a zero-length [`ArrayData`] of `data_type` via `new_null(data_type, 0)`.
    pub fn new_empty(data_type: &DataType) -> Self {
        Self::new_null(data_type, 0)
    }
716
717 pub fn align_buffers(&mut self) {
726 let layout = layout(&self.data_type);
727 for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
728 if let BufferSpec::FixedWidth { alignment, .. } = spec {
729 if buffer.as_ptr().align_offset(*alignment) != 0 {
730 *buffer = Buffer::from_slice_ref(buffer.as_ref());
731 }
732 }
733 }
734 for data in self.child_data.iter_mut() {
736 data.align_buffers()
737 }
738 }
739
    /// Performs the structural validation of this array: buffer count, buffer
    /// sizes and alignment, null buffer size, child arrays, and the first and
    /// last offsets of variable-width types.
    ///
    /// This does not inspect every value; `validate_data` adds the null-count
    /// and per-value checks on top of this method.
    ///
    /// # Errors
    ///
    /// Returns `ArrowError::InvalidArgumentError` describing the first
    /// violation found, or the error produced by child validation.
    pub fn validate(&self) -> Result<(), ArrowError> {
        // Every buffer must cover at least this many elements.
        let len_plus_offset = self.len + self.offset;

        // Expected buffer layout for this data type.
        let layout = layout(&self.data_type);

        if !layout.can_contain_null_mask && self.nulls.is_some() {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Arrays of type {:?} cannot contain a null bitmask",
                self.data_type,
            )));
        }

        // Variadic layouts may carry extra buffers; all others need an exact match.
        if self.buffers.len() < layout.buffers.len()
            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
        {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Expected {} buffers in array of type {:?}, got {}",
                layout.buffers.len(),
                self.data_type,
                self.buffers.len(),
            )));
        }

        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
            match spec {
                BufferSpec::FixedWidth {
                    byte_width,
                    alignment,
                } => {
                    // Saturate so an overflowing requirement still fails the size check.
                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);

                    if buffer.len() < min_buffer_size {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
                            min_buffer_size, i, self.data_type, buffer.len()
                        )));
                    }

                    let align_offset = buffer.as_ptr().align_offset(*alignment);
                    if align_offset != 0 {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
                            self.data_type, align_offset.min(alignment - align_offset)
                        )));
                    }
                }
                BufferSpec::VariableWidth => {
                    // No size check here; offsets are checked in the type-specific
                    // section at the end of this function.
                }
                BufferSpec::BitMap => {
                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
                    if buffer.len() < min_buffer_size {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
                            min_buffer_size, i, self.data_type, buffer.len()
                        )));
                    }
                }
                BufferSpec::AlwaysNull => {
                    // Nothing to validate for this buffer slot.
                }
            }
        }

        // Null buffer: plausible null count, enough bytes, and matching length.
        if let Some(nulls) = self.nulls() {
            if nulls.null_count() > self.len {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "null_count {} for an array exceeds length of {} elements",
                    nulls.null_count(),
                    self.len
                )));
            }

            let actual_len = nulls.validity().len();
            let needed_len = bit_util::ceil(len_plus_offset, 8);
            if actual_len < needed_len {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
                )));
            }

            if nulls.len() != self.len {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "null buffer incorrect size. got {} expected {}",
                    nulls.len(),
                    self.len
                )));
            }
        }

        self.validate_child_data()?;

        // Additional type-specific checks.
        match &self.data_type {
            DataType::Utf8 | DataType::Binary => {
                self.validate_offsets::<i32>(self.buffers[1].len())?;
            }
            DataType::LargeUtf8 | DataType::LargeBinary => {
                self.validate_offsets::<i64>(self.buffers[1].len())?;
            }
            DataType::Dictionary(key_type, _value_type) => {
                if !DataType::is_dictionary_key_type(key_type) {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "Dictionary key type must be integer, but was {key_type}"
                    )));
                }
            }
            DataType::RunEndEncoded(run_ends_type, _) => {
                if run_ends_type.is_nullable() {
                    return Err(ArrowError::InvalidArgumentError(
                        "The nullable should be set to false for the field defining run_ends array.".to_string()
                    ));
                }
                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
                        run_ends_type.data_type()
                    )));
                }
            }
            _ => {}
        };

        Ok(())
    }
882
883 fn typed_offsets<T: ArrowNativeType + num::Num>(&self) -> Result<&[T], ArrowError> {
890 if self.len == 0 && self.buffers[0].is_empty() {
892 return Ok(&[]);
893 }
894
895 self.typed_buffer(0, self.len + 1)
896 }
897
898 fn typed_buffer<T: ArrowNativeType + num::Num>(
900 &self,
901 idx: usize,
902 len: usize,
903 ) -> Result<&[T], ArrowError> {
904 let buffer = &self.buffers[idx];
905
906 let required_len = (len + self.offset) * mem::size_of::<T>();
907
908 if buffer.len() < required_len {
909 return Err(ArrowError::InvalidArgumentError(format!(
910 "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
911 idx,
912 self.data_type,
913 required_len,
914 buffer.len()
915 )));
916 }
917
918 Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
919 }
920
921 fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
924 &self,
925 values_length: usize,
926 ) -> Result<(), ArrowError> {
927 let offsets = self.typed_offsets::<T>()?;
929 if offsets.is_empty() {
930 return Ok(());
931 }
932
933 let first_offset = offsets[0].to_usize().ok_or_else(|| {
934 ArrowError::InvalidArgumentError(format!(
935 "Error converting offset[0] ({}) to usize for {}",
936 offsets[0], self.data_type
937 ))
938 })?;
939
940 let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
941 ArrowError::InvalidArgumentError(format!(
942 "Error converting offset[{}] ({}) to usize for {}",
943 self.len, offsets[self.len], self.data_type
944 ))
945 })?;
946
947 if first_offset > values_length {
948 return Err(ArrowError::InvalidArgumentError(format!(
949 "First offset {} of {} is larger than values length {}",
950 first_offset, self.data_type, values_length,
951 )));
952 }
953
954 if last_offset > values_length {
955 return Err(ArrowError::InvalidArgumentError(format!(
956 "Last offset {} of {} is larger than values length {}",
957 last_offset, self.data_type, values_length,
958 )));
959 }
960
961 if first_offset > last_offset {
962 return Err(ArrowError::InvalidArgumentError(format!(
963 "First offset {} in {} is smaller than last offset {}",
964 first_offset, self.data_type, last_offset,
965 )));
966 }
967
968 Ok(())
969 }
970
971 fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
974 &self,
975 values_length: usize,
976 ) -> Result<(), ArrowError> {
977 let offsets: &[T] = self.typed_buffer(0, self.len)?;
978 let sizes: &[T] = self.typed_buffer(1, self.len)?;
979 for i in 0..values_length {
980 let size = sizes[i].to_usize().ok_or_else(|| {
981 ArrowError::InvalidArgumentError(format!(
982 "Error converting size[{}] ({}) to usize for {}",
983 i, sizes[i], self.data_type
984 ))
985 })?;
986 let offset = offsets[i].to_usize().ok_or_else(|| {
987 ArrowError::InvalidArgumentError(format!(
988 "Error converting offset[{}] ({}) to usize for {}",
989 i, offsets[i], self.data_type
990 ))
991 })?;
992 if size
993 .checked_add(offset)
994 .expect("Offset and size have exceeded the usize boundary")
995 > values_length
996 {
997 return Err(ArrowError::InvalidArgumentError(format!(
998 "Size {} at index {} is larger than the remaining values for {}",
999 size, i, self.data_type
1000 )));
1001 }
1002 }
1003 Ok(())
1004 }
1005
    /// Validates the child arrays of this array against what `data_type`
    /// requires: their number, their types, and type-specific length
    /// constraints. Each child fetched here is itself checked with `validate`.
    ///
    /// # Errors
    ///
    /// Returns `ArrowError::InvalidArgumentError` for the first violation.
    ///
    /// # Panics
    ///
    /// Panics if `len * list_size` overflows for a `FixedSizeList`.
    fn validate_child_data(&self) -> Result<(), ArrowError> {
        match &self.data_type {
            // Offset-based lists: one child; end offsets must fit in the child.
            DataType::List(field) | DataType::Map(field, _) => {
                let values_data = self.get_single_valid_child_data(field.data_type())?;
                self.validate_offsets::<i32>(values_data.len)?;
                Ok(())
            }
            DataType::LargeList(field) => {
                let values_data = self.get_single_valid_child_data(field.data_type())?;
                self.validate_offsets::<i64>(values_data.len)?;
                Ok(())
            }
            // List views: one child; each (offset, size) pair must fit in the child.
            DataType::ListView(field) => {
                let values_data = self.get_single_valid_child_data(field.data_type())?;
                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
                Ok(())
            }
            DataType::LargeListView(field) => {
                let values_data = self.get_single_valid_child_data(field.data_type())?;
                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
                Ok(())
            }
            DataType::FixedSizeList(field, list_size) => {
                let values_data = self.get_single_valid_child_data(field.data_type())?;

                // A negative list size fails the conversion to usize.
                let list_size: usize = (*list_size).try_into().map_err(|_| {
                    ArrowError::InvalidArgumentError(format!(
                        "{} has a negative list_size {}",
                        self.data_type, list_size
                    ))
                })?;

                let expected_values_len = self.len
                    .checked_mul(list_size)
                    .expect("integer overflow computing expected number of expected values in FixedListSize");

                if values_data.len < expected_values_len {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
                        values_data.len, self.len, list_size, self.data_type
                    )));
                }

                Ok(())
            }
            DataType::Struct(fields) => {
                self.validate_num_child_data(fields.len())?;
                for (i, field) in fields.iter().enumerate() {
                    let field_data = self.get_valid_child_data(i, field.data_type())?;

                    // Each child must be at least as long as the struct itself.
                    if field_data.len < self.len {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
                            self.data_type, i, field.name(), field_data.len, self.len
                        )));
                    }
                }
                Ok(())
            }
            DataType::RunEndEncoded(run_ends_field, values_field) => {
                // Exactly two children: run ends and values, of equal length.
                self.validate_num_child_data(2)?;
                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
                if run_ends_data.len != values_data.len {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
                        run_ends_data.len, values_data.len
                    )));
                }
                // Any null buffer on run ends is rejected, whatever its contents.
                if run_ends_data.nulls.is_some() {
                    return Err(ArrowError::InvalidArgumentError(
                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
                    ));
                }
                Ok(())
            }
            DataType::Union(fields, mode) => {
                self.validate_num_child_data(fields.len())?;

                for (i, (_, field)) in fields.iter().enumerate() {
                    let field_data = self.get_valid_child_data(i, field.data_type())?;

                    // Sparse children must cover len + offset; dense children are not length-checked.
                    if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
                            i, field_data.len, self.len + self.offset
                        )));
                    }
                }
                Ok(())
            }
            DataType::Dictionary(_key_type, value_type) => {
                // The single child holds the dictionary values.
                self.get_single_valid_child_data(value_type)?;
                Ok(())
            }
            _ => {
                // All remaining types must not have any children.
                if !self.child_data.is_empty() {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "Expected no child arrays for type {} but got {}",
                        self.data_type,
                        self.child_data.len()
                    )));
                }
                Ok(())
            }
        }
    }
1116
1117 fn get_single_valid_child_data(
1121 &self,
1122 expected_type: &DataType,
1123 ) -> Result<&ArrayData, ArrowError> {
1124 self.validate_num_child_data(1)?;
1125 self.get_valid_child_data(0, expected_type)
1126 }
1127
1128 fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1130 if self.child_data.len() != expected_len {
1131 Err(ArrowError::InvalidArgumentError(format!(
1132 "Value data for {} should contain {} child data array(s), had {}",
1133 self.data_type,
1134 expected_len,
1135 self.child_data.len()
1136 )))
1137 } else {
1138 Ok(())
1139 }
1140 }
1141
1142 fn get_valid_child_data(
1145 &self,
1146 i: usize,
1147 expected_type: &DataType,
1148 ) -> Result<&ArrayData, ArrowError> {
1149 let values_data = self.child_data.get(i).ok_or_else(|| {
1150 ArrowError::InvalidArgumentError(format!(
1151 "{} did not have enough child arrays. Expected at least {} but had only {}",
1152 self.data_type,
1153 i + 1,
1154 self.child_data.len()
1155 ))
1156 })?;
1157
1158 if expected_type != &values_data.data_type {
1159 return Err(ArrowError::InvalidArgumentError(format!(
1160 "Child type mismatch for {}. Expected {} but child data had {}",
1161 self.data_type, expected_type, values_data.data_type
1162 )));
1163 }
1164
1165 values_data.validate()?;
1166 Ok(values_data)
1167 }
1168
1169 pub fn validate_data(&self) -> Result<(), ArrowError> {
1185 self.validate()?;
1186
1187 self.validate_nulls()?;
1188 self.validate_values()?;
1189 Ok(())
1190 }
1191
1192 pub fn validate_full(&self) -> Result<(), ArrowError> {
1197 self.validate_data()?;
1198 self.child_data
1200 .iter()
1201 .enumerate()
1202 .try_for_each(|(i, child_data)| {
1203 child_data.validate_full().map_err(|e| {
1204 ArrowError::InvalidArgumentError(format!(
1205 "{} child #{} invalid: {}",
1206 self.data_type, i, e
1207 ))
1208 })
1209 })?;
1210 Ok(())
1211 }
1212
1213 pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1223 if let Some(nulls) = &self.nulls {
1224 let actual = nulls.len() - nulls.inner().count_set_bits();
1225 if actual != nulls.null_count() {
1226 return Err(ArrowError::InvalidArgumentError(format!(
1227 "null_count value ({}) doesn't match actual number of nulls in array ({})",
1228 nulls.null_count(),
1229 actual
1230 )));
1231 }
1232 }
1233
1234 match &self.data_type {
1239 DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1240 if !f.is_nullable() {
1241 self.validate_non_nullable(None, &self.child_data[0])?
1242 }
1243 }
1244 DataType::FixedSizeList(field, len) => {
1245 let child = &self.child_data[0];
1246 if !field.is_nullable() {
1247 match &self.nulls {
1248 Some(nulls) => {
1249 let element_len = *len as usize;
1250 let expanded = nulls.expand(element_len);
1251 self.validate_non_nullable(Some(&expanded), child)?;
1252 }
1253 None => self.validate_non_nullable(None, child)?,
1254 }
1255 }
1256 }
1257 DataType::Struct(fields) => {
1258 for (field, child) in fields.iter().zip(&self.child_data) {
1259 if !field.is_nullable() {
1260 self.validate_non_nullable(self.nulls(), child)?
1261 }
1262 }
1263 }
1264 _ => {}
1265 }
1266
1267 Ok(())
1268 }
1269
1270 fn validate_non_nullable(
1272 &self,
1273 mask: Option<&NullBuffer>,
1274 child: &ArrayData,
1275 ) -> Result<(), ArrowError> {
1276 let mask = match mask {
1277 Some(mask) => mask,
1278 None => {
1279 return match child.null_count() {
1280 0 => Ok(()),
1281 _ => Err(ArrowError::InvalidArgumentError(format!(
1282 "non-nullable child of type {} contains nulls not present in parent {}",
1283 child.data_type, self.data_type
1284 ))),
1285 }
1286 }
1287 };
1288
1289 match child.nulls() {
1290 Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1291 "non-nullable child of type {} contains nulls not present in parent",
1292 child.data_type
1293 ))),
1294 _ => Ok(()),
1295 }
1296 }
1297
1298 pub fn validate_values(&self) -> Result<(), ArrowError> {
1304 match &self.data_type {
1305 DataType::Utf8 => self.validate_utf8::<i32>(),
1306 DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1307 DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1308 DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1309 DataType::BinaryView => {
1310 let views = self.typed_buffer::<u128>(0, self.len)?;
1311 validate_binary_view(views, &self.buffers[1..])
1312 }
1313 DataType::Utf8View => {
1314 let views = self.typed_buffer::<u128>(0, self.len)?;
1315 validate_string_view(views, &self.buffers[1..])
1316 }
1317 DataType::List(_) | DataType::Map(_, _) => {
1318 let child = &self.child_data[0];
1319 self.validate_offsets_full::<i32>(child.len)
1320 }
1321 DataType::LargeList(_) => {
1322 let child = &self.child_data[0];
1323 self.validate_offsets_full::<i64>(child.len)
1324 }
1325 DataType::Union(_, _) => {
1326 Ok(())
1332 }
1333 DataType::Dictionary(key_type, _value_type) => {
1334 let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1335 let max_value = dictionary_length - 1;
1336 match key_type.as_ref() {
1337 DataType::UInt8 => self.check_bounds::<u8>(max_value),
1338 DataType::UInt16 => self.check_bounds::<u16>(max_value),
1339 DataType::UInt32 => self.check_bounds::<u32>(max_value),
1340 DataType::UInt64 => self.check_bounds::<u64>(max_value),
1341 DataType::Int8 => self.check_bounds::<i8>(max_value),
1342 DataType::Int16 => self.check_bounds::<i16>(max_value),
1343 DataType::Int32 => self.check_bounds::<i32>(max_value),
1344 DataType::Int64 => self.check_bounds::<i64>(max_value),
1345 _ => unreachable!(),
1346 }
1347 }
1348 DataType::RunEndEncoded(run_ends, _values) => {
1349 let run_ends_data = self.child_data()[0].clone();
1350 match run_ends.data_type() {
1351 DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1352 DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1353 DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1354 _ => unreachable!(),
1355 }
1356 }
1357 _ => {
1358 Ok(())
1360 }
1361 }
1362 }
1363
    /// Walks the offsets of this array and calls `validate(i, range)` for
    /// each element `i`, where `range` spans from the previous offset to the
    /// current one.
    ///
    /// Every offset must convert to `usize`, be at most `offset_limit`, and
    /// be no smaller than the offset before it.
    ///
    /// NOTE(review): `skip(1)` discards the first item of the pipeline even
    /// when it is an `Err`, so a bad first offset is not reported here —
    /// presumably `validate_offsets` is expected to have caught it; confirm.
    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
    where
        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
    {
        self.typed_offsets::<T>()?
            .iter()
            .enumerate()
            .map(|(i, x)| {
                // Step 1: convert the raw offset to usize and bound-check it.
                let r = x.to_usize().ok_or_else(|| {
                    ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
                    );
                match r {
                    Ok(n) if n <= offset_limit => Ok((i, n)),
                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
                    ),
                    Err(e) => Err(e),
                }
            })
            .scan(0_usize, |start, end| {
                // Step 2: pair each offset with its predecessor (state starts
                // at 0) and reject offsets that go backwards.
                match end {
                    Ok((i, end)) if *start <= end => {
                        let range = Some(Ok((i, *start..end)));
                        *start = end;
                        range
                    }
                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
                        i - 1, start, end))
                    )),
                    Err(err) => Some(Err(err)),
                }
            })
            // The first item pairs the initial state 0 with the first offset
            // and does not describe an element.
            .skip(1)
            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
                // Offset position `item_index` closes element `item_index - 1`.
                let (item_index, range) = res?;
                validate(item_index-1, range)
            })
    }
1418
1419 fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1422 where
1423 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1424 {
1425 let values_buffer = &self.buffers[1].as_slice();
1426 if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1427 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1429 if !values_str.is_char_boundary(range.start)
1430 || !values_str.is_char_boundary(range.end)
1431 {
1432 return Err(ArrowError::InvalidArgumentError(format!(
1433 "incomplete utf-8 byte sequence from index {string_index}"
1434 )));
1435 }
1436 Ok(())
1437 })
1438 } else {
1439 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1441 std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1442 ArrowError::InvalidArgumentError(format!(
1443 "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1444 ))
1445 })?;
1446 Ok(())
1447 })
1448 }
1449 }
1450
1451 fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1454 where
1455 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1456 {
1457 self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1458 Ok(())
1461 })
1462 }
1463
1464 fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1467 where
1468 T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1469 {
1470 let required_len = self.len + self.offset;
1471 let buffer = &self.buffers[0];
1472
1473 assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1476
1477 let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1479
1480 indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1481 if self.is_null(i) {
1483 return Ok(());
1484 }
1485 let dict_index: i64 = dict_index.try_into().map_err(|_| {
1486 ArrowError::InvalidArgumentError(format!(
1487 "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1488 ))
1489 })?;
1490
1491 if dict_index < 0 || dict_index > max_value {
1492 return Err(ArrowError::InvalidArgumentError(format!(
1493 "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1494 )));
1495 }
1496 Ok(())
1497 })
1498 }
1499
1500 fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1502 where
1503 T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1504 {
1505 let values = self.typed_buffer::<T>(0, self.len)?;
1506 let mut prev_value: i64 = 0_i64;
1507 values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1508 let value: i64 = inp_value.try_into().map_err(|_| {
1509 ArrowError::InvalidArgumentError(format!(
1510 "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1511 ))
1512 })?;
1513 if value <= 0_i64 {
1514 return Err(ArrowError::InvalidArgumentError(format!(
1515 "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1516 )));
1517 }
1518 if ix > 0 && value <= prev_value {
1519 return Err(ArrowError::InvalidArgumentError(format!(
1520 "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1521 )));
1522 }
1523
1524 prev_value = value;
1525 Ok(())
1526 })?;
1527
1528 if prev_value.as_usize() < (self.offset + self.len) {
1529 return Err(ArrowError::InvalidArgumentError(format!(
1530 "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1531 self.offset + self.len
1532 )));
1533 }
1534 Ok(())
1535 }
1536
1537 pub fn ptr_eq(&self, other: &Self) -> bool {
1541 if self.offset != other.offset
1542 || self.len != other.len
1543 || self.data_type != other.data_type
1544 || self.buffers.len() != other.buffers.len()
1545 || self.child_data.len() != other.child_data.len()
1546 {
1547 return false;
1548 }
1549
1550 match (&self.nulls, &other.nulls) {
1551 (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1552 (Some(_), None) | (None, Some(_)) => return false,
1553 _ => {}
1554 };
1555
1556 if !self
1557 .buffers
1558 .iter()
1559 .zip(other.buffers.iter())
1560 .all(|(a, b)| a.as_ptr() == b.as_ptr())
1561 {
1562 return false;
1563 }
1564
1565 self.child_data
1566 .iter()
1567 .zip(other.child_data.iter())
1568 .all(|(a, b)| a.ptr_eq(b))
1569 }
1570
1571 pub fn into_builder(self) -> ArrayDataBuilder {
1573 self.into()
1574 }
1575}
1576
1577pub fn layout(data_type: &DataType) -> DataTypeLayout {
1580 use arrow_schema::IntervalUnit::*;
1583
1584 match data_type {
1585 DataType::Null => DataTypeLayout {
1586 buffers: vec![],
1587 can_contain_null_mask: false,
1588 variadic: false,
1589 },
1590 DataType::Boolean => DataTypeLayout {
1591 buffers: vec![BufferSpec::BitMap],
1592 can_contain_null_mask: true,
1593 variadic: false,
1594 },
1595 DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1596 DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1597 DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1598 DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1599 DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1600 DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1601 DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1602 DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1603 DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1604 DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1605 DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1606 DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1607 DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1608 DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1609 DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1610 DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1611 DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1612 DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1613 DataType::Interval(MonthDayNano) => {
1614 DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1615 }
1616 DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1617 DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1618 DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1619 DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1620 DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1621 DataType::FixedSizeBinary(size) => {
1622 let spec = BufferSpec::FixedWidth {
1623 byte_width: (*size).try_into().unwrap(),
1624 alignment: mem::align_of::<u8>(),
1625 };
1626 DataTypeLayout {
1627 buffers: vec![spec],
1628 can_contain_null_mask: true,
1629 variadic: false,
1630 }
1631 }
1632 DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1633 DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1634 DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1635 DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1636 DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1637 DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1639 DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1640 DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1641 DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1642 DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1643 DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), DataType::Union(_, mode) => {
1646 let type_ids = BufferSpec::FixedWidth {
1647 byte_width: mem::size_of::<i8>(),
1648 alignment: mem::align_of::<i8>(),
1649 };
1650
1651 DataTypeLayout {
1652 buffers: match mode {
1653 UnionMode::Sparse => {
1654 vec![type_ids]
1655 }
1656 UnionMode::Dense => {
1657 vec![
1658 type_ids,
1659 BufferSpec::FixedWidth {
1660 byte_width: mem::size_of::<i32>(),
1661 alignment: mem::align_of::<i32>(),
1662 },
1663 ]
1664 }
1665 },
1666 can_contain_null_mask: false,
1667 variadic: false,
1668 }
1669 }
1670 DataType::Dictionary(key_type, _value_type) => layout(key_type),
1671 }
1672}
1673
/// Describes the buffers that make up an array of a given data type.
#[derive(Debug, PartialEq, Eq)]
pub struct DataTypeLayout {
    /// One [`BufferSpec`] per expected buffer, in buffer order.
    pub buffers: Vec<BufferSpec>,

    /// Whether arrays with this layout can carry a null mask.
    pub can_contain_null_mask: bool,

    /// Set by [`DataTypeLayout::new_view`] and
    /// [`DataTypeLayout::new_list_view`].
    ///
    /// NOTE(review): presumably marks layouts that may carry more buffers
    /// than are listed in `buffers` — confirm against the validation code.
    pub variadic: bool,
}

impl DataTypeLayout {
    /// Spec for a fixed-width buffer sized and aligned like `T`.
    fn fixed_width_spec<T>() -> BufferSpec {
        BufferSpec::FixedWidth {
            byte_width: mem::size_of::<T>(),
            alignment: mem::align_of::<T>(),
        }
    }

    /// Layout with a single fixed-width buffer of `T` and a null mask.
    pub fn new_fixed_width<T>() -> Self {
        let buffers = vec![Self::fixed_width_spec::<T>()];
        Self {
            buffers,
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Layout with no buffers that can still carry a null mask.
    pub fn new_nullable_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Layout with no buffers and no null mask.
    pub fn new_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: false,
            variadic: false,
        }
    }

    /// Layout with a fixed-width buffer of `T` followed by a variable-width
    /// buffer, plus a null mask.
    pub fn new_binary<T>() -> Self {
        let buffers = vec![Self::fixed_width_spec::<T>(), BufferSpec::VariableWidth];
        Self {
            buffers,
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Layout with one `u128`-wide fixed-width buffer, a null mask, and
    /// `variadic` set.
    pub fn new_view() -> Self {
        let buffers = vec![Self::fixed_width_spec::<u128>()];
        Self {
            buffers,
            can_contain_null_mask: true,
            variadic: true,
        }
    }

    /// Layout with two fixed-width buffers of `T`, a null mask, and
    /// `variadic` set.
    ///
    /// NOTE(review): `variadic: true` looks surprising for a layout that
    /// lists exactly two buffers — confirm this is intended.
    pub fn new_list_view<T>() -> Self {
        let buffers = vec![
            Self::fixed_width_spec::<T>(),
            Self::fixed_width_spec::<T>(),
        ];
        Self {
            buffers,
            can_contain_null_mask: true,
            variadic: true,
        }
    }
}

/// Describes a single buffer within a [`DataTypeLayout`].
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Buffer of fixed-width elements.
    FixedWidth {
        /// Width of one element in bytes.
        byte_width: usize,
        /// Alignment associated with the element type, in bytes.
        alignment: usize,
    },
    /// Buffer of variable-width data.
    VariableWidth,
    /// Bit-packed buffer.
    BitMap,
    /// Not constructed anywhere in the visible code, hence the lint allowance.
    #[allow(dead_code)]
    AlwaysNull,
}
1805
1806impl PartialEq for ArrayData {
1807 fn eq(&self, other: &Self) -> bool {
1808 equal::equal(self, other)
1809 }
1810}
1811
/// A boolean flag that can only be changed through an `unsafe` call.
///
/// `ArrayDataBuilder` stores one of these as `skip_validation` and reads it
/// in `build` to decide whether validation runs.
#[derive(Clone, Debug)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a flag that is initially `false`.
    #[inline]
    pub const fn new() -> Self {
        UnsafeFlag(false)
    }

    /// Sets the flag to `val`.
    ///
    /// # Safety
    /// NOTE(review): the exact contract is not stated in the visible code.
    /// Presumably the caller must uphold whatever invariant the flag's owner
    /// skips checking once the flag is `true` — confirm.
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        let Self(flag) = self;
        *flag = val;
    }

    /// Returns the current value of the flag.
    #[inline]
    pub fn get(&self) -> bool {
        let Self(flag) = self;
        *flag
    }
}

impl Default for UnsafeFlag {
    /// Same as [`UnsafeFlag::new`]: the flag starts out `false`.
    fn default() -> Self {
        UnsafeFlag::new()
    }
}
1870
1871#[derive(Debug)]
1873pub struct ArrayDataBuilder {
1874 data_type: DataType,
1875 len: usize,
1876 null_count: Option<usize>,
1877 null_bit_buffer: Option<Buffer>,
1878 nulls: Option<NullBuffer>,
1879 offset: usize,
1880 buffers: Vec<Buffer>,
1881 child_data: Vec<ArrayData>,
1882 align_buffers: bool,
1886 skip_validation: UnsafeFlag,
1896}
1897
1898impl ArrayDataBuilder {
1899 #[inline]
1900 pub const fn new(data_type: DataType) -> Self {
1902 Self {
1903 data_type,
1904 len: 0,
1905 null_count: None,
1906 null_bit_buffer: None,
1907 nulls: None,
1908 offset: 0,
1909 buffers: vec![],
1910 child_data: vec![],
1911 align_buffers: false,
1912 skip_validation: UnsafeFlag::new(),
1913 }
1914 }
1915
1916 pub fn data_type(self, data_type: DataType) -> Self {
1918 Self { data_type, ..self }
1919 }
1920
1921 #[inline]
1922 #[allow(clippy::len_without_is_empty)]
1923 pub const fn len(mut self, n: usize) -> Self {
1925 self.len = n;
1926 self
1927 }
1928
1929 pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1931 self.nulls = nulls;
1932 self.null_count = None;
1933 self.null_bit_buffer = None;
1934 self
1935 }
1936
1937 pub fn null_count(mut self, null_count: usize) -> Self {
1939 self.null_count = Some(null_count);
1940 self
1941 }
1942
1943 pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1945 self.nulls = None;
1946 self.null_bit_buffer = buf;
1947 self
1948 }
1949
1950 #[inline]
1952 pub const fn offset(mut self, n: usize) -> Self {
1953 self.offset = n;
1954 self
1955 }
1956
1957 pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1959 self.buffers = v;
1960 self
1961 }
1962
1963 pub fn add_buffer(mut self, b: Buffer) -> Self {
1965 self.buffers.push(b);
1966 self
1967 }
1968
1969 pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1971 self.buffers.extend(bs);
1972 self
1973 }
1974
1975 pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1977 self.child_data = v;
1978 self
1979 }
1980
1981 pub fn add_child_data(mut self, r: ArrayData) -> Self {
1983 self.child_data.push(r);
1984 self
1985 }
1986
1987 pub unsafe fn build_unchecked(self) -> ArrayData {
2002 self.skip_validation(true).build().unwrap()
2003 }
2004
2005 pub fn build(self) -> Result<ArrayData, ArrowError> {
2014 let Self {
2015 data_type,
2016 len,
2017 null_count,
2018 null_bit_buffer,
2019 nulls,
2020 offset,
2021 buffers,
2022 child_data,
2023 align_buffers,
2024 skip_validation,
2025 } = self;
2026
2027 let nulls = nulls
2028 .or_else(|| {
2029 let buffer = null_bit_buffer?;
2030 let buffer = BooleanBuffer::new(buffer, offset, len);
2031 Some(match null_count {
2032 Some(n) => {
2033 unsafe { NullBuffer::new_unchecked(buffer, n) }
2035 }
2036 None => NullBuffer::new(buffer),
2037 })
2038 })
2039 .filter(|b| b.null_count() != 0);
2040
2041 let mut data = ArrayData {
2042 data_type,
2043 len,
2044 offset,
2045 buffers,
2046 child_data,
2047 nulls,
2048 };
2049
2050 if align_buffers {
2051 data.align_buffers();
2052 }
2053
2054 if !skip_validation.get() || cfg!(feature = "force_validate") {
2056 data.validate_data()?;
2057 }
2058 Ok(data)
2059 }
2060
2061 #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2063 pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2064 self.align_buffers(true).build()
2065 }
2066
2067 pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2083 self.align_buffers = align_buffers;
2084 self
2085 }
2086
2087 pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2101 self.skip_validation.set(skip_validation);
2102 self
2103 }
2104}
2105
2106impl From<ArrayData> for ArrayDataBuilder {
2107 fn from(d: ArrayData) -> Self {
2108 Self {
2109 data_type: d.data_type,
2110 len: d.len,
2111 offset: d.offset,
2112 buffers: d.buffers,
2113 child_data: d.child_data,
2114 nulls: d.nulls,
2115 null_bit_buffer: None,
2116 null_count: None,
2117 align_buffers: false,
2118 skip_validation: UnsafeFlag::new(),
2119 }
2120 }
2121}
2122
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::{Field, Fields};

    /// Buffer holding `n` copies of `42_i32`.
    fn make_i32_buffer(n: usize) -> Buffer {
        Buffer::from_slice_ref(vec![42i32; n])
    }

    /// Buffer holding `n` copies of `42_f32`.
    fn make_f32_buffer(n: usize) -> Buffer {
        Buffer::from_slice_ref(vec![42f32; n])
    }

    /// Two-byte validity bitmap with only bits 0, 3 and 10 set.
    fn sample_validity() -> [u8; 2] {
        let mut bits: [u8; 2] = [0; 2];
        for position in [0, 3, 10] {
            bit_util::set_bit(&mut bits, position);
        }
        bits
    }

    /// 16-element `Int32` array whose validity comes from `sample_validity`.
    fn sample_int32_data() -> ArrayData {
        ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(sample_validity())))
            .build()
            .unwrap()
    }

    /// Three-element `Utf8` array over "abcdef" with the middle slot null.
    fn sample_string_data() -> ArrayData {
        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
        ArrayData::try_new(
            DataType::Utf8,
            3,
            Some(Buffer::from_iter(vec![true, false, true])),
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap()
    }

    #[test]
    fn test_builder() {
        let values: Vec<i32> = (0..25).collect();
        let validity_bytes: [u8; 4] = [0b01011111, 0b10110101, 0b01100011, 0b00011110];

        let arr_data = ArrayData::builder(DataType::Int32)
            .len(20)
            .offset(5)
            .add_buffer(Buffer::from_slice_ref(&values))
            .null_bit_buffer(Some(Buffer::from(validity_bytes)))
            .build()
            .unwrap();

        assert_eq!(20, arr_data.len());
        assert_eq!(10, arr_data.null_count());
        assert_eq!(5, arr_data.offset());
        assert_eq!(1, arr_data.buffers().len());
        assert_eq!(
            Buffer::from_slice_ref(&values).as_slice(),
            arr_data.buffers()[0].as_slice()
        );
    }

    #[test]
    fn test_builder_with_child_data() {
        let child_arr_data = ArrayData::try_new(
            DataType::Int32,
            5,
            None,
            0,
            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
            vec![],
        )
        .unwrap();

        let struct_type =
            DataType::Struct(vec![Arc::new(Field::new("x", DataType::Int32, true))].into());

        let arr_data = ArrayData::builder(struct_type)
            .len(5)
            .offset(0)
            .add_child_data(child_arr_data.clone())
            .build()
            .unwrap();

        assert_eq!(5, arr_data.len());
        assert_eq!(1, arr_data.child_data().len());
        assert_eq!(child_arr_data, arr_data.child_data()[0]);
    }

    #[test]
    fn test_null_count() {
        // No offset: 3 of the 16 bits are set.
        let whole = sample_int32_data();
        assert_eq!(13, whole.null_count());

        // Offset 2, length 12: bits 3 and 10 fall inside the window.
        let windowed = ArrayData::builder(DataType::Int32)
            .len(12)
            .offset(2)
            .add_buffer(make_i32_buffer(14))
            .null_bit_buffer(Some(Buffer::from(sample_validity())))
            .build()
            .unwrap();
        assert_eq!(10, windowed.null_count());
    }

    #[test]
    fn test_null_buffer_ref() {
        let bit_v = sample_validity();
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();

        assert!(arr_data.nulls().is_some());
        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
    }

    #[test]
    fn test_slice() {
        let data = sample_int32_data();

        // Dropping slot 0 removes a valid slot, so the null count is unchanged.
        let first_slice = data.slice(1, 15);
        assert_eq!(data.len() - 1, first_slice.len());
        assert_eq!(1, first_slice.offset());
        assert_eq!(data.null_count(), first_slice.null_count());

        // Slicing the slice drops slot 1 of the original, which is null.
        let second_slice = first_slice.slice(1, 14);
        assert_eq!(data.len() - 2, second_slice.len());
        assert_eq!(2, second_slice.offset());
        assert_eq!(data.null_count() - 1, second_slice.null_count());
    }

    #[test]
    fn test_equality() {
        let int_data = ArrayData::builder(DataType::Int32)
            .len(1)
            .add_buffer(make_i32_buffer(1))
            .build()
            .unwrap();
        let float_data = ArrayData::builder(DataType::Float32)
            .len(1)
            .add_buffer(make_f32_buffer(1))
            .build()
            .unwrap();

        assert_ne!(int_data, float_data);
        assert!(!int_data.ptr_eq(&float_data));
        assert!(int_data.ptr_eq(&int_data));

        #[allow(clippy::redundant_clone)]
        let int_data_clone = int_data.clone();
        assert_eq!(int_data, int_data_clone);
        assert!(int_data.ptr_eq(&int_data_clone));
        assert!(int_data_clone.ptr_eq(&int_data));

        let int_data_slice = int_data_clone.slice(1, 0);
        assert!(int_data_slice.ptr_eq(&int_data_slice));
        assert!(!int_data.ptr_eq(&int_data_slice));
        assert!(!int_data_slice.ptr_eq(&int_data));

        let string_data = sample_string_data();

        assert_ne!(float_data, string_data);
        assert!(!float_data.ptr_eq(&string_data));

        assert!(string_data.ptr_eq(&string_data));

        #[allow(clippy::redundant_clone)]
        let string_data_cloned = string_data.clone();
        assert!(string_data_cloned.ptr_eq(&string_data));
        assert!(string_data.ptr_eq(&string_data_cloned));

        let string_data_slice = string_data.slice(1, 2);
        assert!(string_data_slice.ptr_eq(&string_data_slice));
        assert!(!string_data_slice.ptr_eq(&string_data))
    }

    #[test]
    fn test_slice_memory_size() {
        let data = sample_int32_data();
        let sliced = data.slice(1, 14);
        assert_eq!(
            data.get_slice_memory_size().unwrap() - 8,
            sliced.get_slice_memory_size().unwrap()
        );

        let string_data = sample_string_data();
        let string_data_slice = string_data.slice(1, 2);
        assert_eq!(
            string_data.get_slice_memory_size().unwrap() - 6,
            string_data_slice.get_slice_memory_size().unwrap()
        );
    }

    #[test]
    fn test_count_nulls() {
        let bitmap = Buffer::from([0b00010110, 0b10011111]);
        let nulls = NullBuffer::new(BooleanBuffer::new(bitmap, 0, 16));

        assert_eq!(count_nulls(Some(&nulls), 0, 16), 7);
        assert_eq!(count_nulls(Some(&nulls), 4, 8), 3);
    }

    #[test]
    fn test_contains_nulls() {
        let bitmap: Buffer =
            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
        let nulls = NullBuffer::new(BooleanBuffer::new(bitmap, 0, 6));

        assert!(contains_nulls(Some(&nulls), 0, 6));
        assert!(contains_nulls(Some(&nulls), 0, 3));
        assert!(!contains_nulls(Some(&nulls), 3, 2));
        assert!(!contains_nulls(Some(&nulls), 0, 0));
    }

    #[test]
    fn test_alignment() {
        let aligned = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        // Slicing one byte in yields a buffer that no longer starts on an
        // `i32` boundary.
        let misaligned = aligned.slice(1);

        let mut data = ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![aligned],
            child_data: vec![],
            nulls: None,
        };
        data.validate_full().unwrap();

        data.buffers[0] = misaligned;
        let err = data.validate().unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
        );

        data.align_buffers();
        data.validate_full().unwrap();
    }

    #[test]
    fn test_alignment_struct() {
        let aligned = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        let misaligned = aligned.slice(1);

        let child = ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![aligned],
            child_data: vec![],
            nulls: None,
        };

        let struct_type =
            DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
        let mut data = ArrayData {
            data_type: struct_type,
            len: 0,
            offset: 0,
            buffers: vec![],
            child_data: vec![child],
            nulls: None,
        };
        data.validate_full().unwrap();

        // Misalign the child's buffer; validating the parent reports it.
        data.child_data[0].buffers[0] = misaligned;
        let err = data.validate().unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
        );

        data.align_buffers();
        data.validate_full().unwrap();
    }

    #[test]
    fn test_null_view_types() {
        let array_len = 32;

        let binary_view = ArrayData::new_null(&DataType::BinaryView, array_len);
        assert_eq!(binary_view.len(), array_len);
        for i in 0..binary_view.len() {
            assert!(binary_view.is_null(i));
        }

        let utf8_view = ArrayData::new_null(&DataType::Utf8View, array_len);
        assert_eq!(utf8_view.len(), array_len);
        for i in 0..utf8_view.len() {
            assert!(utf8_view.is_null(i));
        }
    }
}