1use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24 bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35 null_bit_buffer: Option<&NullBuffer>,
36 offset: usize,
37 len: usize,
38) -> bool {
39 match null_bit_buffer {
40 Some(buffer) => {
41 match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42 Some((start, end)) => start != 0 || end != len,
43 None => len != 0, }
45 }
46 None => false, }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52 null_bit_buffer: Option<&NullBuffer>,
53 offset: usize,
54 len: usize,
55) -> usize {
56 if let Some(buf) = null_bit_buffer {
57 let buffer = buf.buffer();
58 len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59 } else {
60 0
61 }
62}
63
64#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67 let empty_buffer = MutableBuffer::new(0);
68 match data_type {
69 DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70 DataType::Boolean => {
71 let bytes = bit_util::ceil(capacity, 8);
72 let buffer = MutableBuffer::new(bytes);
73 [buffer, empty_buffer]
74 }
75 DataType::UInt8
76 | DataType::UInt16
77 | DataType::UInt32
78 | DataType::UInt64
79 | DataType::Int8
80 | DataType::Int16
81 | DataType::Int32
82 | DataType::Int64
83 | DataType::Float16
84 | DataType::Float32
85 | DataType::Float64
86 | DataType::Decimal128(_, _)
87 | DataType::Decimal256(_, _)
88 | DataType::Date32
89 | DataType::Time32(_)
90 | DataType::Date64
91 | DataType::Time64(_)
92 | DataType::Duration(_)
93 | DataType::Timestamp(_, _)
94 | DataType::Interval(_) => [
95 MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
96 empty_buffer,
97 ],
98 DataType::Utf8 | DataType::Binary => {
99 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
100 buffer.push(0i32);
102 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
103 }
104 DataType::LargeUtf8 | DataType::LargeBinary => {
105 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
106 buffer.push(0i64);
108 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
109 }
110 DataType::BinaryView | DataType::Utf8View => [
111 MutableBuffer::new(capacity * mem::size_of::<u128>()),
112 empty_buffer,
113 ],
114 DataType::List(_) | DataType::Map(_, _) => {
115 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
117 buffer.push(0i32);
118 [buffer, empty_buffer]
119 }
120 DataType::ListView(_) => [
121 MutableBuffer::new(capacity * mem::size_of::<i32>()),
122 MutableBuffer::new(capacity * mem::size_of::<i32>()),
123 ],
124 DataType::LargeList(_) => {
125 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
127 buffer.push(0i64);
128 [buffer, empty_buffer]
129 }
130 DataType::LargeListView(_) => [
131 MutableBuffer::new(capacity * mem::size_of::<i64>()),
132 MutableBuffer::new(capacity * mem::size_of::<i64>()),
133 ],
134 DataType::FixedSizeBinary(size) => {
135 [MutableBuffer::new(capacity * *size as usize), empty_buffer]
136 }
137 DataType::Dictionary(k, _) => [
138 MutableBuffer::new(capacity * k.primitive_width().unwrap()),
139 empty_buffer,
140 ],
141 DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
142 [empty_buffer, MutableBuffer::new(0)]
143 }
144 DataType::Union(_, mode) => {
145 let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
146 match mode {
147 UnionMode::Sparse => [type_ids, empty_buffer],
148 UnionMode::Dense => {
149 let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
150 [type_ids, offsets]
151 }
152 }
153 }
154 }
155}
156
157#[derive(Debug, Clone)]
203pub struct ArrayData {
204 data_type: DataType,
206
207 len: usize,
209
210 offset: usize,
215
216 buffers: Vec<Buffer>,
229
230 child_data: Vec<ArrayData>,
240
241 nulls: Option<NullBuffer>,
249}
250
251pub type ArrayDataRef = Arc<ArrayData>;
253
254impl ArrayData {
255 pub unsafe fn new_unchecked(
272 data_type: DataType,
273 len: usize,
274 null_count: Option<usize>,
275 null_bit_buffer: Option<Buffer>,
276 offset: usize,
277 buffers: Vec<Buffer>,
278 child_data: Vec<ArrayData>,
279 ) -> Self {
280 let mut skip_validation = UnsafeFlag::new();
281 skip_validation.set(true);
283
284 ArrayDataBuilder {
285 data_type,
286 len,
287 null_count,
288 null_bit_buffer,
289 nulls: None,
290 offset,
291 buffers,
292 child_data,
293 align_buffers: false,
294 skip_validation,
295 }
296 .build()
297 .unwrap()
298 }
299
300 pub fn try_new(
311 data_type: DataType,
312 len: usize,
313 null_bit_buffer: Option<Buffer>,
314 offset: usize,
315 buffers: Vec<Buffer>,
316 child_data: Vec<ArrayData>,
317 ) -> Result<Self, ArrowError> {
318 if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
322 let needed_len = bit_util::ceil(len + offset, 8);
323 if null_bit_buffer.len() < needed_len {
324 return Err(ArrowError::InvalidArgumentError(format!(
325 "null_bit_buffer size too small. got {} needed {}",
326 null_bit_buffer.len(),
327 needed_len
328 )));
329 }
330 }
331 let new_self = unsafe {
333 Self::new_unchecked(
334 data_type,
335 len,
336 None,
337 null_bit_buffer,
338 offset,
339 buffers,
340 child_data,
341 )
342 };
343
344 new_self.validate_data()?;
349 Ok(new_self)
350 }
351
352 #[inline]
354 pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
355 ArrayDataBuilder::new(data_type)
356 }
357
358 #[inline]
360 pub const fn data_type(&self) -> &DataType {
361 &self.data_type
362 }
363
364 pub fn buffers(&self) -> &[Buffer] {
366 &self.buffers
367 }
368
369 pub fn child_data(&self) -> &[ArrayData] {
372 &self.child_data[..]
373 }
374
375 #[inline]
377 pub fn is_null(&self, i: usize) -> bool {
378 match &self.nulls {
379 Some(v) => v.is_null(i),
380 None => false,
381 }
382 }
383
384 #[inline]
388 pub fn nulls(&self) -> Option<&NullBuffer> {
389 self.nulls.as_ref()
390 }
391
392 #[inline]
394 pub fn is_valid(&self, i: usize) -> bool {
395 !self.is_null(i)
396 }
397
398 #[inline]
400 pub const fn len(&self) -> usize {
401 self.len
402 }
403
404 #[inline]
406 pub const fn is_empty(&self) -> bool {
407 self.len == 0
408 }
409
410 #[inline]
412 pub const fn offset(&self) -> usize {
413 self.offset
414 }
415
416 #[inline]
418 pub fn null_count(&self) -> usize {
419 self.nulls
420 .as_ref()
421 .map(|x| x.null_count())
422 .unwrap_or_default()
423 }
424
425 pub fn get_buffer_memory_size(&self) -> usize {
437 let mut size = 0;
438 for buffer in &self.buffers {
439 size += buffer.capacity();
440 }
441 if let Some(bitmap) = &self.nulls {
442 size += bitmap.buffer().capacity()
443 }
444 for child in &self.child_data {
445 size += child.get_buffer_memory_size();
446 }
447 size
448 }
449
450 pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
463 let mut result: usize = 0;
464 let layout = layout(&self.data_type);
465
466 for spec in layout.buffers.iter() {
467 match spec {
468 BufferSpec::FixedWidth { byte_width, .. } => {
469 let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
470 ArrowError::ComputeError(
471 "Integer overflow computing buffer size".to_string(),
472 )
473 })?;
474 result += buffer_size;
475 }
476 BufferSpec::VariableWidth => {
477 let buffer_len: usize;
478 match self.data_type {
479 DataType::Utf8 | DataType::Binary => {
480 let offsets = self.typed_offsets::<i32>()?;
481 buffer_len = (offsets[self.len] - offsets[0] ) as usize;
482 }
483 DataType::LargeUtf8 | DataType::LargeBinary => {
484 let offsets = self.typed_offsets::<i64>()?;
485 buffer_len = (offsets[self.len] - offsets[0]) as usize;
486 }
487 _ => {
488 return Err(ArrowError::NotYetImplemented(format!(
489 "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
490 self.data_type
491 )))
492 }
493 };
494 result += buffer_len;
495 }
496 BufferSpec::BitMap => {
497 let buffer_size = bit_util::ceil(self.len, 8);
498 result += buffer_size;
499 }
500 BufferSpec::AlwaysNull => {
501 }
503 }
504 }
505
506 if self.nulls().is_some() {
507 result += bit_util::ceil(self.len, 8);
508 }
509
510 for child in &self.child_data {
511 result += child.get_slice_memory_size()?;
512 }
513 Ok(result)
514 }
515
516 pub fn get_array_memory_size(&self) -> usize {
525 let mut size = mem::size_of_val(self);
526
527 for buffer in &self.buffers {
529 size += mem::size_of::<Buffer>();
530 size += buffer.capacity();
531 }
532 if let Some(nulls) = &self.nulls {
533 size += nulls.buffer().capacity();
534 }
535 for child in &self.child_data {
536 size += child.get_array_memory_size();
537 }
538
539 size
540 }
541
542 pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
550 assert!((offset + length) <= self.len());
551
552 if let DataType::Struct(_) = self.data_type() {
553 let new_offset = self.offset + offset;
555 let new_data = ArrayData {
556 data_type: self.data_type().clone(),
557 len: length,
558 offset: new_offset,
559 buffers: self.buffers.clone(),
560 child_data: self
562 .child_data()
563 .iter()
564 .map(|data| data.slice(offset, length))
565 .collect(),
566 nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
567 };
568
569 new_data
570 } else {
571 let mut new_data = self.clone();
572
573 new_data.len = length;
574 new_data.offset = offset + self.offset;
575 new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
576
577 new_data
578 }
579 }
580
581 pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
588 &self.buffers()[buffer].typed_data()[self.offset..]
589 }
590
591 pub fn new_null(data_type: &DataType, len: usize) -> Self {
593 let bit_len = bit_util::ceil(len, 8);
594 let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
595
596 let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
597 Some(width) => (vec![zeroed(width * len)], vec![], true),
598 None => match data_type {
599 DataType::Null => (vec![], vec![], false),
600 DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
601 DataType::Binary | DataType::Utf8 => {
602 (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
603 }
604 DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
605 DataType::LargeBinary | DataType::LargeUtf8 => {
606 (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
607 }
608 DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
609 DataType::List(f) | DataType::Map(f, _) => (
610 vec![zeroed((len + 1) * 4)],
611 vec![ArrayData::new_empty(f.data_type())],
612 true,
613 ),
614 DataType::LargeList(f) => (
615 vec![zeroed((len + 1) * 8)],
616 vec![ArrayData::new_empty(f.data_type())],
617 true,
618 ),
619 DataType::FixedSizeList(f, list_len) => (
620 vec![],
621 vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
622 true,
623 ),
624 DataType::Struct(fields) => (
625 vec![],
626 fields
627 .iter()
628 .map(|f| Self::new_null(f.data_type(), len))
629 .collect(),
630 true,
631 ),
632 DataType::Dictionary(k, v) => (
633 vec![zeroed(k.primitive_width().unwrap() * len)],
634 vec![ArrayData::new_empty(v.as_ref())],
635 true,
636 ),
637 DataType::Union(f, mode) => {
638 let (id, _) = f.iter().next().unwrap();
639 let ids = Buffer::from_iter(std::iter::repeat(id).take(len));
640 let buffers = match mode {
641 UnionMode::Sparse => vec![ids],
642 UnionMode::Dense => {
643 let end_offset = i32::from_usize(len).unwrap();
644 vec![ids, Buffer::from_iter(0_i32..end_offset)]
645 }
646 };
647
648 let children = f
649 .iter()
650 .enumerate()
651 .map(|(idx, (_, f))| {
652 if idx == 0 || *mode == UnionMode::Sparse {
653 Self::new_null(f.data_type(), len)
654 } else {
655 Self::new_empty(f.data_type())
656 }
657 })
658 .collect();
659
660 (buffers, children, false)
661 }
662 DataType::RunEndEncoded(r, v) => {
663 let runs = match r.data_type() {
664 DataType::Int16 => {
665 let i = i16::from_usize(len).expect("run overflow");
666 Buffer::from_slice_ref([i])
667 }
668 DataType::Int32 => {
669 let i = i32::from_usize(len).expect("run overflow");
670 Buffer::from_slice_ref([i])
671 }
672 DataType::Int64 => {
673 let i = i64::from_usize(len).expect("run overflow");
674 Buffer::from_slice_ref([i])
675 }
676 dt => unreachable!("Invalid run ends data type {dt}"),
677 };
678
679 let builder = ArrayData::builder(r.data_type().clone())
680 .len(1)
681 .buffers(vec![runs]);
682
683 let runs = unsafe { builder.build_unchecked() };
686 (
687 vec![],
688 vec![runs, ArrayData::new_null(v.data_type(), 1)],
689 false,
690 )
691 }
692 d => unreachable!("{d}"),
693 },
694 };
695
696 let mut builder = ArrayDataBuilder::new(data_type.clone())
697 .len(len)
698 .buffers(buffers)
699 .child_data(child_data);
700
701 if has_nulls {
702 builder = builder.nulls(Some(NullBuffer::new_null(len)))
703 }
704
705 unsafe { builder.build_unchecked() }
708 }
709
710 pub fn new_empty(data_type: &DataType) -> Self {
712 Self::new_null(data_type, 0)
713 }
714
715 pub fn align_buffers(&mut self) {
724 let layout = layout(&self.data_type);
725 for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
726 if let BufferSpec::FixedWidth { alignment, .. } = spec {
727 if buffer.as_ptr().align_offset(*alignment) != 0 {
728 *buffer = Buffer::from_slice_ref(buffer.as_ref());
729 }
730 }
731 }
732 for data in self.child_data.iter_mut() {
734 data.align_buffers()
735 }
736 }
737
738 pub fn validate(&self) -> Result<(), ArrowError> {
749 let len_plus_offset = self.len + self.offset;
751
752 let layout = layout(&self.data_type);
754
755 if !layout.can_contain_null_mask && self.nulls.is_some() {
756 return Err(ArrowError::InvalidArgumentError(format!(
757 "Arrays of type {:?} cannot contain a null bitmask",
758 self.data_type,
759 )));
760 }
761
762 if self.buffers.len() < layout.buffers.len()
764 || (!layout.variadic && self.buffers.len() != layout.buffers.len())
765 {
766 return Err(ArrowError::InvalidArgumentError(format!(
767 "Expected {} buffers in array of type {:?}, got {}",
768 layout.buffers.len(),
769 self.data_type,
770 self.buffers.len(),
771 )));
772 }
773
774 for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
775 match spec {
776 BufferSpec::FixedWidth {
777 byte_width,
778 alignment,
779 } => {
780 let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
781
782 if buffer.len() < min_buffer_size {
783 return Err(ArrowError::InvalidArgumentError(format!(
784 "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
785 min_buffer_size, i, self.data_type, buffer.len()
786 )));
787 }
788
789 let align_offset = buffer.as_ptr().align_offset(*alignment);
790 if align_offset != 0 {
791 return Err(ArrowError::InvalidArgumentError(format!(
792 "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
793 self.data_type, align_offset.min(alignment - align_offset)
794 )));
795 }
796 }
797 BufferSpec::VariableWidth => {
798 }
802 BufferSpec::BitMap => {
803 let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
804 if buffer.len() < min_buffer_size {
805 return Err(ArrowError::InvalidArgumentError(format!(
806 "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
807 min_buffer_size, i, self.data_type, buffer.len()
808 )));
809 }
810 }
811 BufferSpec::AlwaysNull => {
812 }
814 }
815 }
816
817 if let Some(nulls) = self.nulls() {
819 if nulls.null_count() > self.len {
820 return Err(ArrowError::InvalidArgumentError(format!(
821 "null_count {} for an array exceeds length of {} elements",
822 nulls.null_count(),
823 self.len
824 )));
825 }
826
827 let actual_len = nulls.validity().len();
828 let needed_len = bit_util::ceil(len_plus_offset, 8);
829 if actual_len < needed_len {
830 return Err(ArrowError::InvalidArgumentError(format!(
831 "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
832 )));
833 }
834
835 if nulls.len() != self.len {
836 return Err(ArrowError::InvalidArgumentError(format!(
837 "null buffer incorrect size. got {} expected {}",
838 nulls.len(),
839 self.len
840 )));
841 }
842 }
843
844 self.validate_child_data()?;
845
846 match &self.data_type {
848 DataType::Utf8 | DataType::Binary => {
849 self.validate_offsets::<i32>(self.buffers[1].len())?;
850 }
851 DataType::LargeUtf8 | DataType::LargeBinary => {
852 self.validate_offsets::<i64>(self.buffers[1].len())?;
853 }
854 DataType::Dictionary(key_type, _value_type) => {
855 if !DataType::is_dictionary_key_type(key_type) {
857 return Err(ArrowError::InvalidArgumentError(format!(
858 "Dictionary key type must be integer, but was {key_type}"
859 )));
860 }
861 }
862 DataType::RunEndEncoded(run_ends_type, _) => {
863 if run_ends_type.is_nullable() {
864 return Err(ArrowError::InvalidArgumentError(
865 "The nullable should be set to false for the field defining run_ends array.".to_string()
866 ));
867 }
868 if !DataType::is_run_ends_type(run_ends_type.data_type()) {
869 return Err(ArrowError::InvalidArgumentError(format!(
870 "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
871 run_ends_type.data_type()
872 )));
873 }
874 }
875 _ => {}
876 };
877
878 Ok(())
879 }
880
881 fn typed_offsets<T: ArrowNativeType + num::Num>(&self) -> Result<&[T], ArrowError> {
888 if self.len == 0 && self.buffers[0].is_empty() {
890 return Ok(&[]);
891 }
892
893 self.typed_buffer(0, self.len + 1)
894 }
895
896 fn typed_buffer<T: ArrowNativeType + num::Num>(
898 &self,
899 idx: usize,
900 len: usize,
901 ) -> Result<&[T], ArrowError> {
902 let buffer = &self.buffers[idx];
903
904 let required_len = (len + self.offset) * mem::size_of::<T>();
905
906 if buffer.len() < required_len {
907 return Err(ArrowError::InvalidArgumentError(format!(
908 "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
909 idx,
910 self.data_type,
911 required_len,
912 buffer.len()
913 )));
914 }
915
916 Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
917 }
918
919 fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
922 &self,
923 values_length: usize,
924 ) -> Result<(), ArrowError> {
925 let offsets = self.typed_offsets::<T>()?;
927 if offsets.is_empty() {
928 return Ok(());
929 }
930
931 let first_offset = offsets[0].to_usize().ok_or_else(|| {
932 ArrowError::InvalidArgumentError(format!(
933 "Error converting offset[0] ({}) to usize for {}",
934 offsets[0], self.data_type
935 ))
936 })?;
937
938 let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
939 ArrowError::InvalidArgumentError(format!(
940 "Error converting offset[{}] ({}) to usize for {}",
941 self.len, offsets[self.len], self.data_type
942 ))
943 })?;
944
945 if first_offset > values_length {
946 return Err(ArrowError::InvalidArgumentError(format!(
947 "First offset {} of {} is larger than values length {}",
948 first_offset, self.data_type, values_length,
949 )));
950 }
951
952 if last_offset > values_length {
953 return Err(ArrowError::InvalidArgumentError(format!(
954 "Last offset {} of {} is larger than values length {}",
955 last_offset, self.data_type, values_length,
956 )));
957 }
958
959 if first_offset > last_offset {
960 return Err(ArrowError::InvalidArgumentError(format!(
961 "First offset {} in {} is smaller than last offset {}",
962 first_offset, self.data_type, last_offset,
963 )));
964 }
965
966 Ok(())
967 }
968
969 fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
972 &self,
973 values_length: usize,
974 ) -> Result<(), ArrowError> {
975 let offsets: &[T] = self.typed_buffer(0, self.len)?;
976 let sizes: &[T] = self.typed_buffer(1, self.len)?;
977 for i in 0..values_length {
978 let size = sizes[i].to_usize().ok_or_else(|| {
979 ArrowError::InvalidArgumentError(format!(
980 "Error converting size[{}] ({}) to usize for {}",
981 i, sizes[i], self.data_type
982 ))
983 })?;
984 let offset = offsets[i].to_usize().ok_or_else(|| {
985 ArrowError::InvalidArgumentError(format!(
986 "Error converting offset[{}] ({}) to usize for {}",
987 i, offsets[i], self.data_type
988 ))
989 })?;
990 if size
991 .checked_add(offset)
992 .expect("Offset and size have exceeded the usize boundary")
993 > values_length
994 {
995 return Err(ArrowError::InvalidArgumentError(format!(
996 "Size {} at index {} is larger than the remaining values for {}",
997 size, i, self.data_type
998 )));
999 }
1000 }
1001 Ok(())
1002 }
1003
1004 fn validate_child_data(&self) -> Result<(), ArrowError> {
1006 match &self.data_type {
1007 DataType::List(field) | DataType::Map(field, _) => {
1008 let values_data = self.get_single_valid_child_data(field.data_type())?;
1009 self.validate_offsets::<i32>(values_data.len)?;
1010 Ok(())
1011 }
1012 DataType::LargeList(field) => {
1013 let values_data = self.get_single_valid_child_data(field.data_type())?;
1014 self.validate_offsets::<i64>(values_data.len)?;
1015 Ok(())
1016 }
1017 DataType::ListView(field) => {
1018 let values_data = self.get_single_valid_child_data(field.data_type())?;
1019 self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1020 Ok(())
1021 }
1022 DataType::LargeListView(field) => {
1023 let values_data = self.get_single_valid_child_data(field.data_type())?;
1024 self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1025 Ok(())
1026 }
1027 DataType::FixedSizeList(field, list_size) => {
1028 let values_data = self.get_single_valid_child_data(field.data_type())?;
1029
1030 let list_size: usize = (*list_size).try_into().map_err(|_| {
1031 ArrowError::InvalidArgumentError(format!(
1032 "{} has a negative list_size {}",
1033 self.data_type, list_size
1034 ))
1035 })?;
1036
1037 let expected_values_len = self.len
1038 .checked_mul(list_size)
1039 .expect("integer overflow computing expected number of expected values in FixedListSize");
1040
1041 if values_data.len < expected_values_len {
1042 return Err(ArrowError::InvalidArgumentError(format!(
1043 "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1044 values_data.len, self.len, list_size, self.data_type
1045 )));
1046 }
1047
1048 Ok(())
1049 }
1050 DataType::Struct(fields) => {
1051 self.validate_num_child_data(fields.len())?;
1052 for (i, field) in fields.iter().enumerate() {
1053 let field_data = self.get_valid_child_data(i, field.data_type())?;
1054
1055 if field_data.len < self.len {
1057 return Err(ArrowError::InvalidArgumentError(format!(
1058 "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1059 self.data_type, i, field.name(), field_data.len, self.len
1060 )));
1061 }
1062 }
1063 Ok(())
1064 }
1065 DataType::RunEndEncoded(run_ends_field, values_field) => {
1066 self.validate_num_child_data(2)?;
1067 let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1068 let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1069 if run_ends_data.len != values_data.len {
1070 return Err(ArrowError::InvalidArgumentError(format!(
1071 "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1072 run_ends_data.len, values_data.len
1073 )));
1074 }
1075 if run_ends_data.nulls.is_some() {
1076 return Err(ArrowError::InvalidArgumentError(
1077 "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1078 ));
1079 }
1080 Ok(())
1081 }
1082 DataType::Union(fields, mode) => {
1083 self.validate_num_child_data(fields.len())?;
1084
1085 for (i, (_, field)) in fields.iter().enumerate() {
1086 let field_data = self.get_valid_child_data(i, field.data_type())?;
1087
1088 if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1089 return Err(ArrowError::InvalidArgumentError(format!(
1090 "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1091 i, field_data.len, self.len + self.offset
1092 )));
1093 }
1094 }
1095 Ok(())
1096 }
1097 DataType::Dictionary(_key_type, value_type) => {
1098 self.get_single_valid_child_data(value_type)?;
1099 Ok(())
1100 }
1101 _ => {
1102 if !self.child_data.is_empty() {
1104 return Err(ArrowError::InvalidArgumentError(format!(
1105 "Expected no child arrays for type {} but got {}",
1106 self.data_type,
1107 self.child_data.len()
1108 )));
1109 }
1110 Ok(())
1111 }
1112 }
1113 }
1114
1115 fn get_single_valid_child_data(
1119 &self,
1120 expected_type: &DataType,
1121 ) -> Result<&ArrayData, ArrowError> {
1122 self.validate_num_child_data(1)?;
1123 self.get_valid_child_data(0, expected_type)
1124 }
1125
1126 fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1128 if self.child_data.len() != expected_len {
1129 Err(ArrowError::InvalidArgumentError(format!(
1130 "Value data for {} should contain {} child data array(s), had {}",
1131 self.data_type,
1132 expected_len,
1133 self.child_data.len()
1134 )))
1135 } else {
1136 Ok(())
1137 }
1138 }
1139
1140 fn get_valid_child_data(
1143 &self,
1144 i: usize,
1145 expected_type: &DataType,
1146 ) -> Result<&ArrayData, ArrowError> {
1147 let values_data = self.child_data.get(i).ok_or_else(|| {
1148 ArrowError::InvalidArgumentError(format!(
1149 "{} did not have enough child arrays. Expected at least {} but had only {}",
1150 self.data_type,
1151 i + 1,
1152 self.child_data.len()
1153 ))
1154 })?;
1155
1156 if expected_type != &values_data.data_type {
1157 return Err(ArrowError::InvalidArgumentError(format!(
1158 "Child type mismatch for {}. Expected {} but child data had {}",
1159 self.data_type, expected_type, values_data.data_type
1160 )));
1161 }
1162
1163 values_data.validate()?;
1164 Ok(values_data)
1165 }
1166
1167 pub fn validate_data(&self) -> Result<(), ArrowError> {
1183 self.validate()?;
1184
1185 self.validate_nulls()?;
1186 self.validate_values()?;
1187 Ok(())
1188 }
1189
1190 pub fn validate_full(&self) -> Result<(), ArrowError> {
1195 self.validate_data()?;
1196 self.child_data
1198 .iter()
1199 .enumerate()
1200 .try_for_each(|(i, child_data)| {
1201 child_data.validate_full().map_err(|e| {
1202 ArrowError::InvalidArgumentError(format!(
1203 "{} child #{} invalid: {}",
1204 self.data_type, i, e
1205 ))
1206 })
1207 })?;
1208 Ok(())
1209 }
1210
1211 pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1221 if let Some(nulls) = &self.nulls {
1222 let actual = nulls.len() - nulls.inner().count_set_bits();
1223 if actual != nulls.null_count() {
1224 return Err(ArrowError::InvalidArgumentError(format!(
1225 "null_count value ({}) doesn't match actual number of nulls in array ({})",
1226 nulls.null_count(),
1227 actual
1228 )));
1229 }
1230 }
1231
1232 match &self.data_type {
1237 DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1238 if !f.is_nullable() {
1239 self.validate_non_nullable(None, &self.child_data[0])?
1240 }
1241 }
1242 DataType::FixedSizeList(field, len) => {
1243 let child = &self.child_data[0];
1244 if !field.is_nullable() {
1245 match &self.nulls {
1246 Some(nulls) => {
1247 let element_len = *len as usize;
1248 let expanded = nulls.expand(element_len);
1249 self.validate_non_nullable(Some(&expanded), child)?;
1250 }
1251 None => self.validate_non_nullable(None, child)?,
1252 }
1253 }
1254 }
1255 DataType::Struct(fields) => {
1256 for (field, child) in fields.iter().zip(&self.child_data) {
1257 if !field.is_nullable() {
1258 self.validate_non_nullable(self.nulls(), child)?
1259 }
1260 }
1261 }
1262 _ => {}
1263 }
1264
1265 Ok(())
1266 }
1267
1268 fn validate_non_nullable(
1270 &self,
1271 mask: Option<&NullBuffer>,
1272 child: &ArrayData,
1273 ) -> Result<(), ArrowError> {
1274 let mask = match mask {
1275 Some(mask) => mask,
1276 None => {
1277 return match child.null_count() {
1278 0 => Ok(()),
1279 _ => Err(ArrowError::InvalidArgumentError(format!(
1280 "non-nullable child of type {} contains nulls not present in parent {}",
1281 child.data_type, self.data_type
1282 ))),
1283 }
1284 }
1285 };
1286
1287 match child.nulls() {
1288 Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1289 "non-nullable child of type {} contains nulls not present in parent",
1290 child.data_type
1291 ))),
1292 _ => Ok(()),
1293 }
1294 }
1295
1296 pub fn validate_values(&self) -> Result<(), ArrowError> {
1302 match &self.data_type {
1303 DataType::Utf8 => self.validate_utf8::<i32>(),
1304 DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1305 DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1306 DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1307 DataType::BinaryView => {
1308 let views = self.typed_buffer::<u128>(0, self.len)?;
1309 validate_binary_view(views, &self.buffers[1..])
1310 }
1311 DataType::Utf8View => {
1312 let views = self.typed_buffer::<u128>(0, self.len)?;
1313 validate_string_view(views, &self.buffers[1..])
1314 }
1315 DataType::List(_) | DataType::Map(_, _) => {
1316 let child = &self.child_data[0];
1317 self.validate_offsets_full::<i32>(child.len)
1318 }
1319 DataType::LargeList(_) => {
1320 let child = &self.child_data[0];
1321 self.validate_offsets_full::<i64>(child.len)
1322 }
1323 DataType::Union(_, _) => {
1324 Ok(())
1330 }
1331 DataType::Dictionary(key_type, _value_type) => {
1332 let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1333 let max_value = dictionary_length - 1;
1334 match key_type.as_ref() {
1335 DataType::UInt8 => self.check_bounds::<u8>(max_value),
1336 DataType::UInt16 => self.check_bounds::<u16>(max_value),
1337 DataType::UInt32 => self.check_bounds::<u32>(max_value),
1338 DataType::UInt64 => self.check_bounds::<u64>(max_value),
1339 DataType::Int8 => self.check_bounds::<i8>(max_value),
1340 DataType::Int16 => self.check_bounds::<i16>(max_value),
1341 DataType::Int32 => self.check_bounds::<i32>(max_value),
1342 DataType::Int64 => self.check_bounds::<i64>(max_value),
1343 _ => unreachable!(),
1344 }
1345 }
1346 DataType::RunEndEncoded(run_ends, _values) => {
1347 let run_ends_data = self.child_data()[0].clone();
1348 match run_ends.data_type() {
1349 DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1350 DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1351 DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1352 _ => unreachable!(),
1353 }
1354 }
1355 _ => {
1356 Ok(())
1358 }
1359 }
1360 }
1361
1362 fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1373 where
1374 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1375 V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1376 {
1377 self.typed_offsets::<T>()?
1378 .iter()
1379 .enumerate()
1380 .map(|(i, x)| {
1381 let r = x.to_usize().ok_or_else(|| {
1383 ArrowError::InvalidArgumentError(format!(
1384 "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1385 );
1386 match r {
1388 Ok(n) if n <= offset_limit => Ok((i, n)),
1389 Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1390 "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1391 ),
1392 Err(e) => Err(e),
1393 }
1394 })
1395 .scan(0_usize, |start, end| {
1396 match end {
1398 Ok((i, end)) if *start <= end => {
1399 let range = Some(Ok((i, *start..end)));
1400 *start = end;
1401 range
1402 }
1403 Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1404 "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1405 i - 1, start, end))
1406 )),
1407 Err(err) => Some(Err(err)),
1408 }
1409 })
1410 .skip(1) .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1412 let (item_index, range) = res?;
1413 validate(item_index-1, range)
1414 })
1415 }
1416
1417 fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1420 where
1421 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1422 {
1423 let values_buffer = &self.buffers[1].as_slice();
1424 if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1425 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1427 if !values_str.is_char_boundary(range.start)
1428 || !values_str.is_char_boundary(range.end)
1429 {
1430 return Err(ArrowError::InvalidArgumentError(format!(
1431 "incomplete utf-8 byte sequence from index {string_index}"
1432 )));
1433 }
1434 Ok(())
1435 })
1436 } else {
1437 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1439 std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1440 ArrowError::InvalidArgumentError(format!(
1441 "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1442 ))
1443 })?;
1444 Ok(())
1445 })
1446 }
1447 }
1448
1449 fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1452 where
1453 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1454 {
1455 self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1456 Ok(())
1459 })
1460 }
1461
1462 fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1465 where
1466 T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1467 {
1468 let required_len = self.len + self.offset;
1469 let buffer = &self.buffers[0];
1470
1471 assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1474
1475 let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1477
1478 indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1479 if self.is_null(i) {
1481 return Ok(());
1482 }
1483 let dict_index: i64 = dict_index.try_into().map_err(|_| {
1484 ArrowError::InvalidArgumentError(format!(
1485 "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1486 ))
1487 })?;
1488
1489 if dict_index < 0 || dict_index > max_value {
1490 return Err(ArrowError::InvalidArgumentError(format!(
1491 "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1492 )));
1493 }
1494 Ok(())
1495 })
1496 }
1497
1498 fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1500 where
1501 T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1502 {
1503 let values = self.typed_buffer::<T>(0, self.len)?;
1504 let mut prev_value: i64 = 0_i64;
1505 values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1506 let value: i64 = inp_value.try_into().map_err(|_| {
1507 ArrowError::InvalidArgumentError(format!(
1508 "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1509 ))
1510 })?;
1511 if value <= 0_i64 {
1512 return Err(ArrowError::InvalidArgumentError(format!(
1513 "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1514 )));
1515 }
1516 if ix > 0 && value <= prev_value {
1517 return Err(ArrowError::InvalidArgumentError(format!(
1518 "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1519 )));
1520 }
1521
1522 prev_value = value;
1523 Ok(())
1524 })?;
1525
1526 if prev_value.as_usize() < (self.offset + self.len) {
1527 return Err(ArrowError::InvalidArgumentError(format!(
1528 "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1529 self.offset + self.len
1530 )));
1531 }
1532 Ok(())
1533 }
1534
1535 pub fn ptr_eq(&self, other: &Self) -> bool {
1539 if self.offset != other.offset
1540 || self.len != other.len
1541 || self.data_type != other.data_type
1542 || self.buffers.len() != other.buffers.len()
1543 || self.child_data.len() != other.child_data.len()
1544 {
1545 return false;
1546 }
1547
1548 match (&self.nulls, &other.nulls) {
1549 (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1550 (Some(_), None) | (None, Some(_)) => return false,
1551 _ => {}
1552 };
1553
1554 if !self
1555 .buffers
1556 .iter()
1557 .zip(other.buffers.iter())
1558 .all(|(a, b)| a.as_ptr() == b.as_ptr())
1559 {
1560 return false;
1561 }
1562
1563 self.child_data
1564 .iter()
1565 .zip(other.child_data.iter())
1566 .all(|(a, b)| a.ptr_eq(b))
1567 }
1568
1569 pub fn into_builder(self) -> ArrayDataBuilder {
1571 self.into()
1572 }
1573}
1574
1575pub fn layout(data_type: &DataType) -> DataTypeLayout {
1578 use arrow_schema::IntervalUnit::*;
1581
1582 match data_type {
1583 DataType::Null => DataTypeLayout {
1584 buffers: vec![],
1585 can_contain_null_mask: false,
1586 variadic: false,
1587 },
1588 DataType::Boolean => DataTypeLayout {
1589 buffers: vec![BufferSpec::BitMap],
1590 can_contain_null_mask: true,
1591 variadic: false,
1592 },
1593 DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1594 DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1595 DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1596 DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1597 DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1598 DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1599 DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1600 DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1601 DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1602 DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1603 DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1604 DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1605 DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1606 DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1607 DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1608 DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1609 DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1610 DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1611 DataType::Interval(MonthDayNano) => {
1612 DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1613 }
1614 DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1615 DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1616 DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1617 DataType::FixedSizeBinary(size) => {
1618 let spec = BufferSpec::FixedWidth {
1619 byte_width: (*size).try_into().unwrap(),
1620 alignment: mem::align_of::<u8>(),
1621 };
1622 DataTypeLayout {
1623 buffers: vec![spec],
1624 can_contain_null_mask: true,
1625 variadic: false,
1626 }
1627 }
1628 DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1629 DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1630 DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1631 DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1632 DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1633 DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1635 DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1636 DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1637 DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1638 DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1639 DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), DataType::Union(_, mode) => {
1642 let type_ids = BufferSpec::FixedWidth {
1643 byte_width: mem::size_of::<i8>(),
1644 alignment: mem::align_of::<i8>(),
1645 };
1646
1647 DataTypeLayout {
1648 buffers: match mode {
1649 UnionMode::Sparse => {
1650 vec![type_ids]
1651 }
1652 UnionMode::Dense => {
1653 vec![
1654 type_ids,
1655 BufferSpec::FixedWidth {
1656 byte_width: mem::size_of::<i32>(),
1657 alignment: mem::align_of::<i32>(),
1658 },
1659 ]
1660 }
1661 },
1662 can_contain_null_mask: false,
1663 variadic: false,
1664 }
1665 }
1666 DataType::Dictionary(key_type, _value_type) => layout(key_type),
1667 }
1668}
1669
1670#[derive(Debug, PartialEq, Eq)]
1672pub struct DataTypeLayout {
1674 pub buffers: Vec<BufferSpec>,
1676
1677 pub can_contain_null_mask: bool,
1679
1680 pub variadic: bool,
1684}
1685
1686impl DataTypeLayout {
1687 pub fn new_fixed_width<T>() -> Self {
1689 Self {
1690 buffers: vec![BufferSpec::FixedWidth {
1691 byte_width: mem::size_of::<T>(),
1692 alignment: mem::align_of::<T>(),
1693 }],
1694 can_contain_null_mask: true,
1695 variadic: false,
1696 }
1697 }
1698
1699 pub fn new_nullable_empty() -> Self {
1702 Self {
1703 buffers: vec![],
1704 can_contain_null_mask: true,
1705 variadic: false,
1706 }
1707 }
1708
1709 pub fn new_empty() -> Self {
1712 Self {
1713 buffers: vec![],
1714 can_contain_null_mask: false,
1715 variadic: false,
1716 }
1717 }
1718
1719 pub fn new_binary<T>() -> Self {
1723 Self {
1724 buffers: vec![
1725 BufferSpec::FixedWidth {
1727 byte_width: mem::size_of::<T>(),
1728 alignment: mem::align_of::<T>(),
1729 },
1730 BufferSpec::VariableWidth,
1732 ],
1733 can_contain_null_mask: true,
1734 variadic: false,
1735 }
1736 }
1737
1738 pub fn new_view() -> Self {
1740 Self {
1741 buffers: vec![BufferSpec::FixedWidth {
1742 byte_width: mem::size_of::<u128>(),
1743 alignment: mem::align_of::<u128>(),
1744 }],
1745 can_contain_null_mask: true,
1746 variadic: true,
1747 }
1748 }
1749
1750 pub fn new_list_view<T>() -> Self {
1752 Self {
1753 buffers: vec![
1754 BufferSpec::FixedWidth {
1755 byte_width: mem::size_of::<T>(),
1756 alignment: mem::align_of::<T>(),
1757 },
1758 BufferSpec::FixedWidth {
1759 byte_width: mem::size_of::<T>(),
1760 alignment: mem::align_of::<T>(),
1761 },
1762 ],
1763 can_contain_null_mask: true,
1764 variadic: true,
1765 }
1766 }
1767}
1768
1769#[derive(Debug, PartialEq, Eq)]
1771pub enum BufferSpec {
1772 FixedWidth {
1783 byte_width: usize,
1785 alignment: usize,
1787 },
1788 VariableWidth,
1790 BitMap,
1796 #[allow(dead_code)]
1799 AlwaysNull,
1800}
1801
1802impl PartialEq for ArrayData {
1803 fn eq(&self, other: &Self) -> bool {
1804 equal::equal(self, other)
1805 }
1806}
1807
1808#[derive(Debug, Clone)]
1827#[doc(hidden)]
1828pub struct UnsafeFlag(bool);
1829
1830impl UnsafeFlag {
1831 #[inline]
1835 pub const fn new() -> Self {
1836 Self(false)
1837 }
1838
1839 #[inline]
1849 pub unsafe fn set(&mut self, val: bool) {
1850 self.0 = val;
1851 }
1852
1853 #[inline]
1855 pub fn get(&self) -> bool {
1856 self.0
1857 }
1858}
1859
1860impl Default for UnsafeFlag {
1862 fn default() -> Self {
1863 Self::new()
1864 }
1865}
1866
1867#[derive(Debug)]
1869pub struct ArrayDataBuilder {
1870 data_type: DataType,
1871 len: usize,
1872 null_count: Option<usize>,
1873 null_bit_buffer: Option<Buffer>,
1874 nulls: Option<NullBuffer>,
1875 offset: usize,
1876 buffers: Vec<Buffer>,
1877 child_data: Vec<ArrayData>,
1878 align_buffers: bool,
1882 skip_validation: UnsafeFlag,
1892}
1893
1894impl ArrayDataBuilder {
1895 #[inline]
1896 pub const fn new(data_type: DataType) -> Self {
1898 Self {
1899 data_type,
1900 len: 0,
1901 null_count: None,
1902 null_bit_buffer: None,
1903 nulls: None,
1904 offset: 0,
1905 buffers: vec![],
1906 child_data: vec![],
1907 align_buffers: false,
1908 skip_validation: UnsafeFlag::new(),
1909 }
1910 }
1911
1912 pub fn data_type(self, data_type: DataType) -> Self {
1914 Self { data_type, ..self }
1915 }
1916
1917 #[inline]
1918 #[allow(clippy::len_without_is_empty)]
1919 pub const fn len(mut self, n: usize) -> Self {
1921 self.len = n;
1922 self
1923 }
1924
1925 pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1927 self.nulls = nulls;
1928 self.null_count = None;
1929 self.null_bit_buffer = None;
1930 self
1931 }
1932
1933 pub fn null_count(mut self, null_count: usize) -> Self {
1935 self.null_count = Some(null_count);
1936 self
1937 }
1938
1939 pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1941 self.nulls = None;
1942 self.null_bit_buffer = buf;
1943 self
1944 }
1945
1946 #[inline]
1948 pub const fn offset(mut self, n: usize) -> Self {
1949 self.offset = n;
1950 self
1951 }
1952
1953 pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1955 self.buffers = v;
1956 self
1957 }
1958
1959 pub fn add_buffer(mut self, b: Buffer) -> Self {
1961 self.buffers.push(b);
1962 self
1963 }
1964
1965 pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1967 self.buffers.extend(bs);
1968 self
1969 }
1970
1971 pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1973 self.child_data = v;
1974 self
1975 }
1976
1977 pub fn add_child_data(mut self, r: ArrayData) -> Self {
1979 self.child_data.push(r);
1980 self
1981 }
1982
1983 pub unsafe fn build_unchecked(self) -> ArrayData {
1998 self.skip_validation(true).build().unwrap()
1999 }
2000
2001 pub fn build(self) -> Result<ArrayData, ArrowError> {
2010 let Self {
2011 data_type,
2012 len,
2013 null_count,
2014 null_bit_buffer,
2015 nulls,
2016 offset,
2017 buffers,
2018 child_data,
2019 align_buffers,
2020 skip_validation,
2021 } = self;
2022
2023 let nulls = nulls
2024 .or_else(|| {
2025 let buffer = null_bit_buffer?;
2026 let buffer = BooleanBuffer::new(buffer, offset, len);
2027 Some(match null_count {
2028 Some(n) => {
2029 unsafe { NullBuffer::new_unchecked(buffer, n) }
2031 }
2032 None => NullBuffer::new(buffer),
2033 })
2034 })
2035 .filter(|b| b.null_count() != 0);
2036
2037 let mut data = ArrayData {
2038 data_type,
2039 len,
2040 offset,
2041 buffers,
2042 child_data,
2043 nulls,
2044 };
2045
2046 if align_buffers {
2047 data.align_buffers();
2048 }
2049
2050 if !skip_validation.get() || cfg!(feature = "force_validate") {
2052 data.validate_data()?;
2053 }
2054 Ok(data)
2055 }
2056
2057 #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2059 pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2060 self.align_buffers(true).build()
2061 }
2062
2063 pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2079 self.align_buffers = align_buffers;
2080 self
2081 }
2082
2083 pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2097 self.skip_validation.set(skip_validation);
2098 self
2099 }
2100}
2101
2102impl From<ArrayData> for ArrayDataBuilder {
2103 fn from(d: ArrayData) -> Self {
2104 Self {
2105 data_type: d.data_type,
2106 len: d.len,
2107 offset: d.offset,
2108 buffers: d.buffers,
2109 child_data: d.child_data,
2110 nulls: d.nulls,
2111 null_bit_buffer: None,
2112 null_count: None,
2113 align_buffers: false,
2114 skip_validation: UnsafeFlag::new(),
2115 }
2116 }
2117}
2118
2119#[cfg(test)]
2120mod tests {
2121 use super::*;
2122 use arrow_schema::{Field, Fields};
2123
2124 fn make_i32_buffer(n: usize) -> Buffer {
2128 Buffer::from_slice_ref(vec![42i32; n])
2129 }
2130
2131 fn make_f32_buffer(n: usize) -> Buffer {
2133 Buffer::from_slice_ref(vec![42f32; n])
2134 }
2135
2136 #[test]
2137 fn test_builder() {
2138 let v = (0..25).collect::<Vec<i32>>();
2140 let b1 = Buffer::from_slice_ref(&v);
2141 let arr_data = ArrayData::builder(DataType::Int32)
2142 .len(20)
2143 .offset(5)
2144 .add_buffer(b1)
2145 .null_bit_buffer(Some(Buffer::from([
2146 0b01011111, 0b10110101, 0b01100011, 0b00011110,
2147 ])))
2148 .build()
2149 .unwrap();
2150
2151 assert_eq!(20, arr_data.len());
2152 assert_eq!(10, arr_data.null_count());
2153 assert_eq!(5, arr_data.offset());
2154 assert_eq!(1, arr_data.buffers().len());
2155 assert_eq!(
2156 Buffer::from_slice_ref(&v).as_slice(),
2157 arr_data.buffers()[0].as_slice()
2158 );
2159 }
2160
2161 #[test]
2162 fn test_builder_with_child_data() {
2163 let child_arr_data = ArrayData::try_new(
2164 DataType::Int32,
2165 5,
2166 None,
2167 0,
2168 vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2169 vec![],
2170 )
2171 .unwrap();
2172
2173 let field = Arc::new(Field::new("x", DataType::Int32, true));
2174 let data_type = DataType::Struct(vec![field].into());
2175
2176 let arr_data = ArrayData::builder(data_type)
2177 .len(5)
2178 .offset(0)
2179 .add_child_data(child_arr_data.clone())
2180 .build()
2181 .unwrap();
2182
2183 assert_eq!(5, arr_data.len());
2184 assert_eq!(1, arr_data.child_data().len());
2185 assert_eq!(child_arr_data, arr_data.child_data()[0]);
2186 }
2187
2188 #[test]
2189 fn test_null_count() {
2190 let mut bit_v: [u8; 2] = [0; 2];
2191 bit_util::set_bit(&mut bit_v, 0);
2192 bit_util::set_bit(&mut bit_v, 3);
2193 bit_util::set_bit(&mut bit_v, 10);
2194 let arr_data = ArrayData::builder(DataType::Int32)
2195 .len(16)
2196 .add_buffer(make_i32_buffer(16))
2197 .null_bit_buffer(Some(Buffer::from(bit_v)))
2198 .build()
2199 .unwrap();
2200 assert_eq!(13, arr_data.null_count());
2201
2202 let mut bit_v: [u8; 2] = [0; 2];
2204 bit_util::set_bit(&mut bit_v, 0);
2205 bit_util::set_bit(&mut bit_v, 3);
2206 bit_util::set_bit(&mut bit_v, 10);
2207 let arr_data = ArrayData::builder(DataType::Int32)
2208 .len(12)
2209 .offset(2)
2210 .add_buffer(make_i32_buffer(14)) .null_bit_buffer(Some(Buffer::from(bit_v)))
2212 .build()
2213 .unwrap();
2214 assert_eq!(10, arr_data.null_count());
2215 }
2216
2217 #[test]
2218 fn test_null_buffer_ref() {
2219 let mut bit_v: [u8; 2] = [0; 2];
2220 bit_util::set_bit(&mut bit_v, 0);
2221 bit_util::set_bit(&mut bit_v, 3);
2222 bit_util::set_bit(&mut bit_v, 10);
2223 let arr_data = ArrayData::builder(DataType::Int32)
2224 .len(16)
2225 .add_buffer(make_i32_buffer(16))
2226 .null_bit_buffer(Some(Buffer::from(bit_v)))
2227 .build()
2228 .unwrap();
2229 assert!(arr_data.nulls().is_some());
2230 assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2231 }
2232
2233 #[test]
2234 fn test_slice() {
2235 let mut bit_v: [u8; 2] = [0; 2];
2236 bit_util::set_bit(&mut bit_v, 0);
2237 bit_util::set_bit(&mut bit_v, 3);
2238 bit_util::set_bit(&mut bit_v, 10);
2239 let data = ArrayData::builder(DataType::Int32)
2240 .len(16)
2241 .add_buffer(make_i32_buffer(16))
2242 .null_bit_buffer(Some(Buffer::from(bit_v)))
2243 .build()
2244 .unwrap();
2245 let new_data = data.slice(1, 15);
2246 assert_eq!(data.len() - 1, new_data.len());
2247 assert_eq!(1, new_data.offset());
2248 assert_eq!(data.null_count(), new_data.null_count());
2249
2250 let new_data = new_data.slice(1, 14);
2252 assert_eq!(data.len() - 2, new_data.len());
2253 assert_eq!(2, new_data.offset());
2254 assert_eq!(data.null_count() - 1, new_data.null_count());
2255 }
2256
2257 #[test]
2258 fn test_equality() {
2259 let int_data = ArrayData::builder(DataType::Int32)
2260 .len(1)
2261 .add_buffer(make_i32_buffer(1))
2262 .build()
2263 .unwrap();
2264
2265 let float_data = ArrayData::builder(DataType::Float32)
2266 .len(1)
2267 .add_buffer(make_f32_buffer(1))
2268 .build()
2269 .unwrap();
2270 assert_ne!(int_data, float_data);
2271 assert!(!int_data.ptr_eq(&float_data));
2272 assert!(int_data.ptr_eq(&int_data));
2273
2274 #[allow(clippy::redundant_clone)]
2275 let int_data_clone = int_data.clone();
2276 assert_eq!(int_data, int_data_clone);
2277 assert!(int_data.ptr_eq(&int_data_clone));
2278 assert!(int_data_clone.ptr_eq(&int_data));
2279
2280 let int_data_slice = int_data_clone.slice(1, 0);
2281 assert!(int_data_slice.ptr_eq(&int_data_slice));
2282 assert!(!int_data.ptr_eq(&int_data_slice));
2283 assert!(!int_data_slice.ptr_eq(&int_data));
2284
2285 let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2286 let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2287 let string_data = ArrayData::try_new(
2288 DataType::Utf8,
2289 3,
2290 Some(Buffer::from_iter(vec![true, false, true])),
2291 0,
2292 vec![offsets_buffer, data_buffer],
2293 vec![],
2294 )
2295 .unwrap();
2296
2297 assert_ne!(float_data, string_data);
2298 assert!(!float_data.ptr_eq(&string_data));
2299
2300 assert!(string_data.ptr_eq(&string_data));
2301
2302 #[allow(clippy::redundant_clone)]
2303 let string_data_cloned = string_data.clone();
2304 assert!(string_data_cloned.ptr_eq(&string_data));
2305 assert!(string_data.ptr_eq(&string_data_cloned));
2306
2307 let string_data_slice = string_data.slice(1, 2);
2308 assert!(string_data_slice.ptr_eq(&string_data_slice));
2309 assert!(!string_data_slice.ptr_eq(&string_data))
2310 }
2311
2312 #[test]
2313 fn test_slice_memory_size() {
2314 let mut bit_v: [u8; 2] = [0; 2];
2315 bit_util::set_bit(&mut bit_v, 0);
2316 bit_util::set_bit(&mut bit_v, 3);
2317 bit_util::set_bit(&mut bit_v, 10);
2318 let data = ArrayData::builder(DataType::Int32)
2319 .len(16)
2320 .add_buffer(make_i32_buffer(16))
2321 .null_bit_buffer(Some(Buffer::from(bit_v)))
2322 .build()
2323 .unwrap();
2324 let new_data = data.slice(1, 14);
2325 assert_eq!(
2326 data.get_slice_memory_size().unwrap() - 8,
2327 new_data.get_slice_memory_size().unwrap()
2328 );
2329 let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2330 let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2331 let string_data = ArrayData::try_new(
2332 DataType::Utf8,
2333 3,
2334 Some(Buffer::from_iter(vec![true, false, true])),
2335 0,
2336 vec![offsets_buffer, data_buffer],
2337 vec![],
2338 )
2339 .unwrap();
2340 let string_data_slice = string_data.slice(1, 2);
2341 assert_eq!(
2343 string_data.get_slice_memory_size().unwrap() - 6,
2344 string_data_slice.get_slice_memory_size().unwrap()
2345 );
2346 }
2347
2348 #[test]
2349 fn test_count_nulls() {
2350 let buffer = Buffer::from([0b00010110, 0b10011111]);
2351 let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2352 let count = count_nulls(Some(&buffer), 0, 16);
2353 assert_eq!(count, 7);
2354
2355 let count = count_nulls(Some(&buffer), 4, 8);
2356 assert_eq!(count, 3);
2357 }
2358
2359 #[test]
2360 fn test_contains_nulls() {
2361 let buffer: Buffer =
2362 MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2363 let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2364 assert!(contains_nulls(Some(&buffer), 0, 6));
2365 assert!(contains_nulls(Some(&buffer), 0, 3));
2366 assert!(!contains_nulls(Some(&buffer), 3, 2));
2367 assert!(!contains_nulls(Some(&buffer), 0, 0));
2368 }
2369
2370 #[test]
2371 fn test_alignment() {
2372 let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2373 let sliced = buffer.slice(1);
2374
2375 let mut data = ArrayData {
2376 data_type: DataType::Int32,
2377 len: 0,
2378 offset: 0,
2379 buffers: vec![buffer],
2380 child_data: vec![],
2381 nulls: None,
2382 };
2383 data.validate_full().unwrap();
2384
2385 data.buffers[0] = sliced;
2387 let err = data.validate().unwrap_err();
2388
2389 assert_eq!(
2390 err.to_string(),
2391 "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2392 );
2393
2394 data.align_buffers();
2395 data.validate_full().unwrap();
2396 }
2397
2398 #[test]
2399 fn test_alignment_struct() {
2400 let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2401 let sliced = buffer.slice(1);
2402
2403 let child_data = ArrayData {
2404 data_type: DataType::Int32,
2405 len: 0,
2406 offset: 0,
2407 buffers: vec![buffer],
2408 child_data: vec![],
2409 nulls: None,
2410 };
2411
2412 let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2413 let mut data = ArrayData {
2414 data_type: schema,
2415 len: 0,
2416 offset: 0,
2417 buffers: vec![],
2418 child_data: vec![child_data],
2419 nulls: None,
2420 };
2421 data.validate_full().unwrap();
2422
2423 data.child_data[0].buffers[0] = sliced;
2425 let err = data.validate().unwrap_err();
2426
2427 assert_eq!(
2428 err.to_string(),
2429 "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2430 );
2431
2432 data.align_buffers();
2433 data.validate_full().unwrap();
2434 }
2435
2436 #[test]
2437 fn test_null_view_types() {
2438 let array_len = 32;
2439 let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2440 assert_eq!(array.len(), array_len);
2441 for i in 0..array.len() {
2442 assert!(array.is_null(i));
2443 }
2444
2445 let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2446 assert_eq!(array.len(), array_len);
2447 for i in 0..array.len() {
2448 assert!(array.is_null(i));
2449 }
2450 }
2451}