Skip to main content

arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
19//! common attributes and operations for Arrow array.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35    null_bit_buffer: Option<&NullBuffer>,
36    offset: usize,
37    len: usize,
38) -> bool {
39    match null_bit_buffer {
40        Some(buffer) => {
41            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42                Some((start, end)) => start != 0 || end != len,
43                None => len != 0, // No non-null values
44            }
45        }
46        None => false, // No null buffer
47    }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52    null_bit_buffer: Option<&NullBuffer>,
53    offset: usize,
54    len: usize,
55) -> usize {
56    if let Some(buf) = null_bit_buffer {
57        let buffer = buf.buffer();
58        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59    } else {
60        0
61    }
62}
63
64/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67    let empty_buffer = MutableBuffer::new(0);
68    match data_type {
69        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70        DataType::Boolean => {
71            let bytes = bit_util::ceil(capacity, 8);
72            let buffer = MutableBuffer::new(bytes);
73            [buffer, empty_buffer]
74        }
75        DataType::UInt8
76        | DataType::UInt16
77        | DataType::UInt32
78        | DataType::UInt64
79        | DataType::Int8
80        | DataType::Int16
81        | DataType::Int32
82        | DataType::Int64
83        | DataType::Float16
84        | DataType::Float32
85        | DataType::Float64
86        | DataType::Decimal32(_, _)
87        | DataType::Decimal64(_, _)
88        | DataType::Decimal128(_, _)
89        | DataType::Decimal256(_, _)
90        | DataType::Date32
91        | DataType::Time32(_)
92        | DataType::Date64
93        | DataType::Time64(_)
94        | DataType::Duration(_)
95        | DataType::Timestamp(_, _)
96        | DataType::Interval(_) => [
97            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98            empty_buffer,
99        ],
100        DataType::Utf8 | DataType::Binary => {
101            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102            // safety: `unsafe` code assumes that this buffer is initialized with one element
103            buffer.push(0i32);
104            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105        }
106        DataType::LargeUtf8 | DataType::LargeBinary => {
107            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108            // safety: `unsafe` code assumes that this buffer is initialized with one element
109            buffer.push(0i64);
110            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111        }
112        DataType::BinaryView | DataType::Utf8View => [
113            MutableBuffer::new(capacity * mem::size_of::<u128>()),
114            empty_buffer,
115        ],
116        DataType::List(_) | DataType::Map(_, _) => {
117            // offset buffer always starts with a zero
118            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119            buffer.push(0i32);
120            [buffer, empty_buffer]
121        }
122        DataType::ListView(_) => [
123            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124            MutableBuffer::new(capacity * mem::size_of::<i32>()),
125        ],
126        DataType::LargeList(_) => {
127            // offset buffer always starts with a zero
128            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129            buffer.push(0i64);
130            [buffer, empty_buffer]
131        }
132        DataType::LargeListView(_) => [
133            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134            MutableBuffer::new(capacity * mem::size_of::<i64>()),
135        ],
136        DataType::FixedSizeBinary(size) => {
137            if *size < 0 {
138                panic!("cannot construct buffers from FixedSizeBinary({size})");
139            }
140            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
141        }
142        DataType::Dictionary(k, _) => [
143            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
144            empty_buffer,
145        ],
146        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
147            [empty_buffer, MutableBuffer::new(0)]
148        }
149        DataType::Union(_, mode) => {
150            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
151            match mode {
152                UnionMode::Sparse => [type_ids, empty_buffer],
153                UnionMode::Dense => {
154                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
155                    [type_ids, offsets]
156                }
157            }
158        }
159    }
160}
161
162/// A generic representation of Arrow array data which encapsulates common attributes
163/// and operations for Arrow array.
164///
165/// Specific operations for different arrays types (e.g., primitive, list, struct)
166/// are implemented in `Array`.
167///
168/// # Memory Layout
169///
170/// `ArrayData` has references to one or more underlying data buffers
171/// and optional child ArrayData, depending on type as illustrated
172/// below. Bitmaps are not shown for simplicity but they are stored
173/// similarly to the buffers.
174///
175/// ```text
176///                        offset
177///                       points to
178/// ┌───────────────────┐ start of  ┌───────┐       Different
179/// │                   │   data    │       │     ArrayData may
180/// │ArrayData {        │           │....   │     also refers to
181/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
182/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
183/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
184/// │  buffers: [       │           │5882   │◀─
185/// │    ...            │  │        │4323   │
186/// │  ]                │   ─ ─ ─ ─▶│4859   │
187/// │  child_data: [    │           │....   │
188/// │    ...            │           │       │
189/// │  ]                │           └───────┘
190/// │}                  │
191/// │                   │            Shared Buffer uses
192/// │               │   │            bytes::Bytes to hold
193/// └───────────────────┘            actual data values
194///           ┌ ─ ─ ┘
195///
196///           ▼
197/// ┌───────────────────┐
198/// │ArrayData {        │
199/// │  ...              │
200/// │}                  │
201/// │                   │
202/// └───────────────────┘
203///
204/// Child ArrayData may also have its own buffers and children
205/// ```
206
207#[derive(Debug, Clone)]
208pub struct ArrayData {
209    /// The data type
210    data_type: DataType,
211
212    /// The number of elements
213    len: usize,
214
215    /// The offset in number of items (not bytes).
216    ///
217    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
218    /// does NOT apply to [`Self::nulls`].
219    offset: usize,
220
221    /// The buffers that store the actual data for this array, as defined
222    /// in the [Arrow Spec].
223    ///
224    /// Depending on the array types, [`Self::buffers`] can hold different
225    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
226    /// positions.
227    ///
228    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
229    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
230    /// we need to satisfy `len`.
231    ///
232    /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout)
233    buffers: Vec<Buffer>,
234
235    /// The child(ren) of this array.
236    ///
237    /// Only non-empty for nested types, such as `ListArray` and
238    /// `StructArray`.
239    ///
240    /// The first logical element in each child element begins at `offset`.
241    ///
242    /// If the child element also has an offset then these offsets are
243    /// cumulative.
244    child_data: Vec<ArrayData>,
245
246    /// The null bitmap.
247    ///
248    /// `None` indicates all values are non-null in this array.
249    ///
250    /// [`Self::offset]` does not apply to the null bitmap. While the
251    /// BooleanBuffer may be sliced (have its own offset) internally, this
252    /// `NullBuffer` always represents exactly `len` elements.
253    nulls: Option<NullBuffer>,
254}
255
256/// A thread-safe, shared reference to the Arrow array data.
257pub type ArrayDataRef = Arc<ArrayData>;
258
259fn checked_len_plus_offset(
260    data_type: &DataType,
261    len: usize,
262    offset: usize,
263) -> Result<usize, ArrowError> {
264    len.checked_add(offset).ok_or_else(|| {
265        ArrowError::InvalidArgumentError(format!(
266            "Length {len} with offset {offset} overflows usize for {data_type}"
267        ))
268    })
269}
270
271impl ArrayData {
272    /// Create a new ArrayData instance;
273    ///
274    /// If `null_count` is not specified, the number of nulls in
275    /// null_bit_buffer is calculated.
276    ///
277    /// If the number of nulls is 0 then the null_bit_buffer
278    /// is set to `None`.
279    ///
280    /// # Safety
281    ///
282    /// The input values *must* form a valid Arrow array for
283    /// `data_type`, or undefined behavior can result.
284    ///
285    /// Note: This is a low level API and most users of the arrow
286    /// crate should create arrays using the methods in the `array`
287    /// module.
288    pub unsafe fn new_unchecked(
289        data_type: DataType,
290        len: usize,
291        null_count: Option<usize>,
292        null_bit_buffer: Option<Buffer>,
293        offset: usize,
294        buffers: Vec<Buffer>,
295        child_data: Vec<ArrayData>,
296    ) -> Self {
297        let mut skip_validation = UnsafeFlag::new();
298        // SAFETY: caller responsible for ensuring data is valid
299        unsafe { skip_validation.set(true) };
300
301        ArrayDataBuilder {
302            data_type,
303            len,
304            null_count,
305            null_bit_buffer,
306            nulls: None,
307            offset,
308            buffers,
309            child_data,
310            align_buffers: false,
311            skip_validation,
312        }
313        .build()
314        .unwrap()
315    }
316
317    /// Create a new ArrayData, validating that the provided buffers form a valid
318    /// Arrow array of the specified data type.
319    ///
320    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
321    /// is set to `None`.
322    ///
323    /// Internally this calls through to [`Self::validate_data`]
324    ///
325    /// Note: This is a low level API and most users of the arrow crate should create
326    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
327    /// or [`ArrayDataBuilder`].
328    ///
329    /// See also [`Self::into_parts`] to recover the fields
330    pub fn try_new(
331        data_type: DataType,
332        len: usize,
333        null_bit_buffer: Option<Buffer>,
334        offset: usize,
335        buffers: Vec<Buffer>,
336        child_data: Vec<ArrayData>,
337    ) -> Result<Self, ArrowError> {
338        // we must check the length of `null_bit_buffer` first
339        // because we use this buffer to calculate `null_count`
340        // in `Self::new_unchecked`.
341        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
342            let len_plus_offset = checked_len_plus_offset(&data_type, len, offset)?;
343            let needed_len = bit_util::ceil(len_plus_offset, 8);
344            if null_bit_buffer.len() < needed_len {
345                return Err(ArrowError::InvalidArgumentError(format!(
346                    "null_bit_buffer size too small. got {} needed {}",
347                    null_bit_buffer.len(),
348                    needed_len
349                )));
350            }
351        }
352        // Safety justification: `validate_full` is called below
353        let new_self = unsafe {
354            Self::new_unchecked(
355                data_type,
356                len,
357                None,
358                null_bit_buffer,
359                offset,
360                buffers,
361                child_data,
362            )
363        };
364
365        // As the data is not trusted, do a full validation of its contents
366        // We don't need to validate children as we can assume that the
367        // [`ArrayData`] in `child_data` have already been validated through
368        // a call to `ArrayData::try_new` or created using unsafe
369        new_self.validate_data()?;
370        Ok(new_self)
371    }
372
373    /// Return the constituent parts of this ArrayData
374    ///
375    /// This is the inverse of [`ArrayData::try_new`].
376    ///
377    /// Returns `(data_type, len, nulls, offset, buffers, child_data)`
378    pub fn into_parts(
379        self,
380    ) -> (
381        DataType,
382        usize,
383        Option<NullBuffer>,
384        usize,
385        Vec<Buffer>,
386        Vec<ArrayData>,
387    ) {
388        let Self {
389            data_type,
390            len,
391            nulls,
392            offset,
393            buffers,
394            child_data,
395        } = self;
396
397        (data_type, len, nulls, offset, buffers, child_data)
398    }
399
400    /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`]
401    #[inline]
402    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
403        ArrayDataBuilder::new(data_type)
404    }
405
406    /// Returns a reference to the [`DataType`] of this [`ArrayData`]
407    #[inline]
408    pub const fn data_type(&self) -> &DataType {
409        &self.data_type
410    }
411
412    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
413    pub fn buffers(&self) -> &[Buffer] {
414        &self.buffers
415    }
416
417    /// Returns a slice of children [`ArrayData`]. This will be non
418    /// empty for type such as lists and structs.
419    pub fn child_data(&self) -> &[ArrayData] {
420        &self.child_data[..]
421    }
422
423    /// Returns whether the element at index `i` is null
424    #[inline]
425    pub fn is_null(&self, i: usize) -> bool {
426        match &self.nulls {
427            Some(v) => v.is_null(i),
428            None => false,
429        }
430    }
431
432    /// Returns a reference to the null buffer of this [`ArrayData`] if any
433    ///
434    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
435    #[inline]
436    pub fn nulls(&self) -> Option<&NullBuffer> {
437        self.nulls.as_ref()
438    }
439
440    /// Returns whether the element at index `i` is not null
441    #[inline]
442    pub fn is_valid(&self, i: usize) -> bool {
443        !self.is_null(i)
444    }
445
446    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
447    #[inline]
448    pub const fn len(&self) -> usize {
449        self.len
450    }
451
452    /// Returns whether this [`ArrayData`] is empty
453    #[inline]
454    pub const fn is_empty(&self) -> bool {
455        self.len == 0
456    }
457
458    /// Returns the offset of this [`ArrayData`]
459    #[inline]
460    pub const fn offset(&self) -> usize {
461        self.offset
462    }
463
464    /// Returns the total number of nulls in this array
465    #[inline]
466    pub fn null_count(&self) -> usize {
467        self.nulls
468            .as_ref()
469            .map(|x| x.null_count())
470            .unwrap_or_default()
471    }
472
473    /// Returns the total number of bytes of memory occupied by the
474    /// buffers owned by this [`ArrayData`] and all of its
475    /// children. (See also diagram on [`ArrayData`]).
476    ///
477    /// Note that this [`ArrayData`] may only refer to a subset of the
478    /// data in the underlying [`Buffer`]s (due to `offset` and
479    /// `length`), but the size returned includes the entire size of
480    /// the buffers.
481    ///
482    /// If multiple [`ArrayData`]s refer to the same underlying
483    /// [`Buffer`]s they will both report the same size.
484    pub fn get_buffer_memory_size(&self) -> usize {
485        let mut size = 0;
486        for buffer in &self.buffers {
487            size += buffer.capacity();
488        }
489        if let Some(bitmap) = &self.nulls {
490            size += bitmap.buffer().capacity()
491        }
492        for child in &self.child_data {
493            size += child.get_buffer_memory_size();
494        }
495        size
496    }
497
498    /// Returns the total number of the bytes of memory occupied by
499    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
500    ///
501    /// This is approximately the number of bytes if a new
502    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
503    /// exactly the data needed.
504    ///
505    /// For example, a [`DataType::Int64`] with `100` elements,
506    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
507    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
508    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
509    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
510    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
511        let mut result: usize = 0;
512        let layout = layout(&self.data_type);
513
514        for spec in layout.buffers.iter() {
515            match spec {
516                BufferSpec::FixedWidth { byte_width, .. } => {
517                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
518                        ArrowError::ComputeError(
519                            "Integer overflow computing buffer size".to_string(),
520                        )
521                    })?;
522                    result += buffer_size;
523                }
524                BufferSpec::VariableWidth => {
525                    let buffer_len = match self.data_type {
526                        DataType::Utf8 | DataType::Binary => {
527                            let offsets = self.typed_offsets::<i32>()?;
528                            (offsets[self.len] - offsets[0]) as usize
529                        }
530                        DataType::LargeUtf8 | DataType::LargeBinary => {
531                            let offsets = self.typed_offsets::<i64>()?;
532                            (offsets[self.len] - offsets[0]) as usize
533                        }
534                        _ => {
535                            return Err(ArrowError::NotYetImplemented(format!(
536                                "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
537                                self.data_type
538                            )));
539                        }
540                    };
541                    result += buffer_len;
542                }
543                BufferSpec::BitMap => {
544                    let buffer_size = bit_util::ceil(self.len, 8);
545                    result += buffer_size;
546                }
547                BufferSpec::AlwaysNull => {
548                    // Nothing to do
549                }
550            }
551        }
552
553        if self.nulls().is_some() {
554            result += bit_util::ceil(self.len, 8);
555        }
556
557        for child in &self.child_data {
558            result += child.get_slice_memory_size()?;
559        }
560        Ok(result)
561    }
562
563    /// Returns the total number of bytes of memory occupied
564    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
565    /// children. (See also diagram on [`ArrayData`]).
566    ///
567    /// Equivalent to:
568    ///  `size_of_val(self)` +
569    ///  [`Self::get_buffer_memory_size`] +
570    ///  `size_of_val(child)` for all children
571    pub fn get_array_memory_size(&self) -> usize {
572        let mut size = mem::size_of_val(self);
573
574        // Calculate rest of the fields top down which contain actual data
575        for buffer in &self.buffers {
576            size += mem::size_of::<Buffer>();
577            size += buffer.capacity();
578        }
579        if let Some(nulls) = &self.nulls {
580            size += nulls.buffer().capacity();
581        }
582        for child in &self.child_data {
583            size += child.get_array_memory_size();
584        }
585
586        size
587    }
588
589    /// Creates a zero-copy slice of itself. This creates a new
590    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
591    /// different offset and len
592    ///
593    /// # Panics
594    ///
595    /// Panics if `offset + length` overflows or is greater than `self.len()`.
596    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
597        let end = offset
598            .checked_add(length)
599            .expect("offset + length overflow");
600        assert!(end <= self.len());
601
602        if let DataType::Struct(_) = self.data_type() {
603            // Slice into children
604            let new_offset = self.offset + offset;
605            ArrayData {
606                data_type: self.data_type().clone(),
607                len: length,
608                offset: new_offset,
609                buffers: self.buffers.clone(),
610                // Slice child data, to propagate offsets down to them
611                child_data: self
612                    .child_data()
613                    .iter()
614                    .map(|data| data.slice(offset, length))
615                    .collect(),
616                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
617            }
618        } else {
619            let mut new_data = self.clone();
620
621            new_data.len = length;
622            new_data.offset = offset + self.offset;
623            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
624
625            new_data
626        }
627    }
628
629    /// Returns the `buffer` as a slice of type `T` starting at self.offset
630    ///
631    /// # Panics
632    /// This function panics if:
633    /// * the buffer is not byte-aligned with type T, or
634    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
635    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
636        &self.buffers()[buffer].typed_data()[self.offset..]
637    }
638
639    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
640    ///
641    /// # Panics
642    /// This function panics if:
643    /// * the datatype `data_type` has incorrect layout
644    pub fn new_null(data_type: &DataType, len: usize) -> Self {
645        let bit_len = bit_util::ceil(len, 8);
646        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
647
648        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
649            Some(width) => (vec![zeroed(width * len)], vec![], true),
650            None => match data_type {
651                DataType::Null => (vec![], vec![], false),
652                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
653                DataType::Binary | DataType::Utf8 => {
654                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
655                }
656                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
657                DataType::LargeBinary | DataType::LargeUtf8 => {
658                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
659                }
660                DataType::FixedSizeBinary(i) => {
661                    if *i < 0 {
662                        panic!("cannot construct null data from FixedSizeBinary({i})");
663                    }
664                    (vec![zeroed(*i as usize * len)], vec![], true)
665                }
666                DataType::List(f) | DataType::Map(f, _) => (
667                    vec![zeroed((len + 1) * 4)],
668                    vec![ArrayData::new_empty(f.data_type())],
669                    true,
670                ),
671                DataType::LargeList(f) => (
672                    vec![zeroed((len + 1) * 8)],
673                    vec![ArrayData::new_empty(f.data_type())],
674                    true,
675                ),
676                DataType::ListView(f) => (
677                    vec![zeroed(len * 4), zeroed(len * 4)],
678                    vec![ArrayData::new_empty(f.data_type())],
679                    true,
680                ),
681                DataType::LargeListView(f) => (
682                    vec![zeroed(len * 8), zeroed(len * 8)],
683                    vec![ArrayData::new_empty(f.data_type())],
684                    true,
685                ),
686                DataType::FixedSizeList(f, list_len) => (
687                    vec![],
688                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
689                    true,
690                ),
691                DataType::Struct(fields) => (
692                    vec![],
693                    fields
694                        .iter()
695                        .map(|f| Self::new_null(f.data_type(), len))
696                        .collect(),
697                    true,
698                ),
699                DataType::Dictionary(k, v) => (
700                    vec![zeroed(k.primitive_width().unwrap() * len)],
701                    vec![ArrayData::new_empty(v.as_ref())],
702                    true,
703                ),
704                DataType::Union(f, mode) => {
705                    let (id, _) = f.iter().next().unwrap();
706                    let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
707                    let buffers = match mode {
708                        UnionMode::Sparse => vec![ids],
709                        UnionMode::Dense => {
710                            let end_offset = i32::from_usize(len).unwrap();
711                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
712                        }
713                    };
714
715                    let children = f
716                        .iter()
717                        .enumerate()
718                        .map(|(idx, (_, f))| {
719                            if idx == 0 || *mode == UnionMode::Sparse {
720                                Self::new_null(f.data_type(), len)
721                            } else {
722                                Self::new_empty(f.data_type())
723                            }
724                        })
725                        .collect();
726
727                    (buffers, children, false)
728                }
729                DataType::RunEndEncoded(r, v) => {
730                    if len == 0 {
731                        // For empty arrays, create zero-length child arrays.
732                        let runs = ArrayData::new_empty(r.data_type());
733                        let values = ArrayData::new_empty(v.data_type());
734                        (vec![], vec![runs, values], false)
735                    } else {
736                        let runs = match r.data_type() {
737                            DataType::Int16 => {
738                                let i = i16::from_usize(len).expect("run overflow");
739                                Buffer::from_slice_ref([i])
740                            }
741                            DataType::Int32 => {
742                                let i = i32::from_usize(len).expect("run overflow");
743                                Buffer::from_slice_ref([i])
744                            }
745                            DataType::Int64 => {
746                                let i = i64::from_usize(len).expect("run overflow");
747                                Buffer::from_slice_ref([i])
748                            }
749                            dt => unreachable!("Invalid run ends data type {dt}"),
750                        };
751
752                        let builder = ArrayData::builder(r.data_type().clone())
753                            .len(1)
754                            .buffers(vec![runs]);
755
756                        // SAFETY:
757                        // Valid by construction
758                        let runs = unsafe { builder.build_unchecked() };
759                        (
760                            vec![],
761                            vec![runs, ArrayData::new_null(v.data_type(), 1)],
762                            false,
763                        )
764                    }
765                }
766                // Handled by Some(width) branch above
767                DataType::Int8
768                | DataType::Int16
769                | DataType::Int32
770                | DataType::Int64
771                | DataType::UInt8
772                | DataType::UInt16
773                | DataType::UInt32
774                | DataType::UInt64
775                | DataType::Float16
776                | DataType::Float32
777                | DataType::Float64
778                | DataType::Timestamp(_, _)
779                | DataType::Date32
780                | DataType::Date64
781                | DataType::Time32(_)
782                | DataType::Time64(_)
783                | DataType::Duration(_)
784                | DataType::Interval(_)
785                | DataType::Decimal32(_, _)
786                | DataType::Decimal64(_, _)
787                | DataType::Decimal128(_, _)
788                | DataType::Decimal256(_, _) => unreachable!("{data_type}"),
789            },
790        };
791
792        let mut builder = ArrayDataBuilder::new(data_type.clone())
793            .len(len)
794            .buffers(buffers)
795            .child_data(child_data);
796
797        if has_nulls {
798            builder = builder.nulls(Some(NullBuffer::new_null(len)))
799        }
800
801        // SAFETY:
802        // Data valid by construction
803        unsafe { builder.build_unchecked() }
804    }
805
806    /// Returns a new empty [ArrayData] valid for `data_type`.
807    pub fn new_empty(data_type: &DataType) -> Self {
808        Self::new_null(data_type, 0)
809    }
810
811    /// Verifies that the buffers meet the minimum alignment requirements for the data type
812    ///
813    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
814    ///
815    /// This can be useful for when interacting with data sent over IPC or FFI, that may
816    /// not meet the minimum alignment requirements
817    ///
818    /// This also aligns buffers of children data
819    pub fn align_buffers(&mut self) {
820        let layout = layout(&self.data_type);
821        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
822            if let BufferSpec::FixedWidth { alignment, .. } = spec {
823                if buffer.as_ptr().align_offset(*alignment) != 0 {
824                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
825                }
826            }
827        }
828        // align children data recursively
829        for data in self.child_data.iter_mut() {
830            data.align_buffers()
831        }
832    }
833
834    /// "cheap" validation of an `ArrayData`. Ensures buffers are
835    /// sufficiently sized to store `len` + `offset` total elements of
836    /// `data_type` and performs other inexpensive consistency checks.
837    ///
838    /// This check is "cheap" in the sense that it does not validate the
839    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
840    /// are within the bounds of the values buffer).
841    ///
842    /// See [ArrayData::validate_data] to validate fully the offset content
843    /// and the validity of utf8 data
844    pub fn validate(&self) -> Result<(), ArrowError> {
845        // Need at least this much space in each buffer
846        let len_plus_offset = checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
847
848        // Check that the data layout conforms to the spec
849        let layout = layout(&self.data_type);
850
851        if !layout.can_contain_null_mask && self.nulls.is_some() {
852            return Err(ArrowError::InvalidArgumentError(format!(
853                "Arrays of type {:?} cannot contain a null bitmask",
854                self.data_type,
855            )));
856        }
857
858        // Check data buffers length for view types and other types
859        if self.buffers.len() < layout.buffers.len()
860            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
861        {
862            return Err(ArrowError::InvalidArgumentError(format!(
863                "Expected {} buffers in array of type {:?}, got {}",
864                layout.buffers.len(),
865                self.data_type,
866                self.buffers.len(),
867            )));
868        }
869
870        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
871            match spec {
872                BufferSpec::FixedWidth {
873                    byte_width,
874                    alignment,
875                } => {
876                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
877
878                    if buffer.len() < min_buffer_size {
879                        return Err(ArrowError::InvalidArgumentError(format!(
880                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
881                            min_buffer_size,
882                            i,
883                            self.data_type,
884                            buffer.len()
885                        )));
886                    }
887
888                    let align_offset = buffer.as_ptr().align_offset(*alignment);
889                    if align_offset != 0 {
890                        return Err(ArrowError::InvalidArgumentError(format!(
891                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
892                            self.data_type,
893                            align_offset.min(alignment - align_offset)
894                        )));
895                    }
896                }
897                BufferSpec::VariableWidth => {
898                    // not cheap to validate (need to look at the
899                    // data). Partially checked in validate_offsets
900                    // called below. Can check with `validate_full`
901                }
902                BufferSpec::BitMap => {
903                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
904                    if buffer.len() < min_buffer_size {
905                        return Err(ArrowError::InvalidArgumentError(format!(
906                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
907                            min_buffer_size,
908                            i,
909                            self.data_type,
910                            buffer.len()
911                        )));
912                    }
913                }
914                BufferSpec::AlwaysNull => {
915                    // Nothing to validate
916                }
917            }
918        }
919
920        // check null bit buffer size
921        if let Some(nulls) = self.nulls() {
922            if nulls.null_count() > self.len {
923                return Err(ArrowError::InvalidArgumentError(format!(
924                    "null_count {} for an array exceeds length of {} elements",
925                    nulls.null_count(),
926                    self.len
927                )));
928            }
929
930            let actual_len = nulls.validity().len();
931            let needed_len = bit_util::ceil(len_plus_offset, 8);
932            if actual_len < needed_len {
933                return Err(ArrowError::InvalidArgumentError(format!(
934                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
935                )));
936            }
937
938            if nulls.len() != self.len {
939                return Err(ArrowError::InvalidArgumentError(format!(
940                    "null buffer incorrect size. got {} expected {}",
941                    nulls.len(),
942                    self.len
943                )));
944            }
945        }
946
947        self.validate_child_data()?;
948
949        // Additional Type specific checks
950        match &self.data_type {
951            DataType::Utf8 | DataType::Binary => {
952                self.validate_offsets::<i32>(self.buffers[1].len())?;
953            }
954            DataType::LargeUtf8 | DataType::LargeBinary => {
955                self.validate_offsets::<i64>(self.buffers[1].len())?;
956            }
957            DataType::Dictionary(key_type, _value_type) => {
958                // At the moment, constructing a DictionaryArray will also check this
959                if !DataType::is_dictionary_key_type(key_type) {
960                    return Err(ArrowError::InvalidArgumentError(format!(
961                        "Dictionary key type must be integer, but was {key_type}"
962                    )));
963                }
964            }
965            DataType::RunEndEncoded(run_ends_type, _) => {
966                if run_ends_type.is_nullable() {
967                    return Err(ArrowError::InvalidArgumentError(
968                        "The nullable should be set to false for the field defining run_ends array.".to_string()
969                    ));
970                }
971                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
972                    return Err(ArrowError::InvalidArgumentError(format!(
973                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
974                        run_ends_type.data_type()
975                    )));
976                }
977            }
978            _ => {}
979        };
980
981        Ok(())
982    }
983
984    /// Returns a reference to the data in `buffer` as a typed slice
985    /// (typically `&[i32]` or `&[i64]`) after validating. The
986    /// returned slice is guaranteed to have at least `self.len + 1`
987    /// entries.
988    ///
989    /// For an empty array, the `buffer` can also be empty.
990    fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
991        // An empty list-like array can have 0 offsets
992        if self.len == 0 && self.buffers[0].is_empty() {
993            return Ok(&[]);
994        }
995
996        let len = checked_len_plus_offset(&self.data_type, self.len, 1)?;
997
998        self.typed_buffer(0, len)
999    }
1000
1001    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
1002    fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
1003        &self,
1004        idx: usize,
1005        len: usize,
1006    ) -> Result<&[T], ArrowError> {
1007        let buffer = &self.buffers[idx];
1008
1009        let required_elements = checked_len_plus_offset(&self.data_type, len, self.offset)?;
1010        let byte_width = mem::size_of::<T>();
1011        let required_len = required_elements.checked_mul(byte_width).ok_or_else(|| {
1012            ArrowError::InvalidArgumentError(format!(
1013                "Buffer {idx} of {} byte length overflow: {} elements of {} bytes exceeds usize",
1014                self.data_type, required_elements, byte_width
1015            ))
1016        })?;
1017
1018        if buffer.len() < required_len {
1019            return Err(ArrowError::InvalidArgumentError(format!(
1020                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
1021                idx,
1022                self.data_type,
1023                required_len,
1024                buffer.len()
1025            )));
1026        }
1027
1028        Ok(&buffer.typed_data::<T>()[self.offset..required_elements])
1029    }
1030
1031    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
1032    /// offsets (of type T) into some other buffer of `values_length` bytes long
1033    fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
1034        &self,
1035        values_length: usize,
1036    ) -> Result<(), ArrowError> {
1037        // Justification: buffer size was validated above
1038        let offsets = self.typed_offsets::<T>()?;
1039        if offsets.is_empty() {
1040            return Ok(());
1041        }
1042
1043        let first_offset = offsets[0].to_usize().ok_or_else(|| {
1044            ArrowError::InvalidArgumentError(format!(
1045                "Error converting offset[0] ({}) to usize for {}",
1046                offsets[0], self.data_type
1047            ))
1048        })?;
1049
1050        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
1051            ArrowError::InvalidArgumentError(format!(
1052                "Error converting offset[{}] ({}) to usize for {}",
1053                self.len, offsets[self.len], self.data_type
1054            ))
1055        })?;
1056
1057        if first_offset > values_length {
1058            return Err(ArrowError::InvalidArgumentError(format!(
1059                "First offset {} of {} is larger than values length {}",
1060                first_offset, self.data_type, values_length,
1061            )));
1062        }
1063
1064        if last_offset > values_length {
1065            return Err(ArrowError::InvalidArgumentError(format!(
1066                "Last offset {} of {} is larger than values length {}",
1067                last_offset, self.data_type, values_length,
1068            )));
1069        }
1070
1071        if first_offset > last_offset {
1072            return Err(ArrowError::InvalidArgumentError(format!(
1073                "First offset {} in {} is smaller than last offset {}",
1074                first_offset, self.data_type, last_offset,
1075            )));
1076        }
1077
1078        Ok(())
1079    }
1080
1081    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
1082    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
1083    fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
1084        &self,
1085        values_length: usize,
1086    ) -> Result<(), ArrowError> {
1087        let offsets: &[T] = self.typed_buffer(0, self.len)?;
1088        let sizes: &[T] = self.typed_buffer(1, self.len)?;
1089        if offsets.len() != sizes.len() {
1090            return Err(ArrowError::ComputeError(format!(
1091                "ListView offsets len {} does not match sizes len {}",
1092                offsets.len(),
1093                sizes.len()
1094            )));
1095        }
1096
1097        for i in 0..sizes.len() {
1098            let size = sizes[i].to_usize().ok_or_else(|| {
1099                ArrowError::InvalidArgumentError(format!(
1100                    "Error converting size[{}] ({}) to usize for {}",
1101                    i, sizes[i], self.data_type
1102                ))
1103            })?;
1104            let offset = offsets[i].to_usize().ok_or_else(|| {
1105                ArrowError::InvalidArgumentError(format!(
1106                    "Error converting offset[{}] ({}) to usize for {}",
1107                    i, offsets[i], self.data_type
1108                ))
1109            })?;
1110            if size
1111                .checked_add(offset)
1112                .expect("Offset and size have exceeded the usize boundary")
1113                > values_length
1114            {
1115                return Err(ArrowError::InvalidArgumentError(format!(
1116                    "Size {} at index {} is larger than the remaining values for {}",
1117                    size, i, self.data_type
1118                )));
1119            }
1120        }
1121        Ok(())
1122    }
1123
1124    /// Validates the layout of `child_data` ArrayData structures
1125    fn validate_child_data(&self) -> Result<(), ArrowError> {
1126        match &self.data_type {
1127            DataType::List(field) | DataType::Map(field, _) => {
1128                let values_data = self.get_single_valid_child_data(field.data_type())?;
1129                self.validate_offsets::<i32>(values_data.len)?;
1130                Ok(())
1131            }
1132            DataType::LargeList(field) => {
1133                let values_data = self.get_single_valid_child_data(field.data_type())?;
1134                self.validate_offsets::<i64>(values_data.len)?;
1135                Ok(())
1136            }
1137            DataType::ListView(field) => {
1138                let values_data = self.get_single_valid_child_data(field.data_type())?;
1139                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1140                Ok(())
1141            }
1142            DataType::LargeListView(field) => {
1143                let values_data = self.get_single_valid_child_data(field.data_type())?;
1144                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1145                Ok(())
1146            }
1147            DataType::FixedSizeList(field, list_size) => {
1148                let values_data = self.get_single_valid_child_data(field.data_type())?;
1149
1150                let list_size: usize = (*list_size).try_into().map_err(|_| {
1151                    ArrowError::InvalidArgumentError(format!(
1152                        "{} has a negative list_size {}",
1153                        self.data_type, list_size
1154                    ))
1155                })?;
1156
1157                let expected_values_len = self.len
1158                    .checked_mul(list_size)
1159                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1160
1161                if values_data.len < expected_values_len {
1162                    return Err(ArrowError::InvalidArgumentError(format!(
1163                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1164                        values_data.len, self.len, list_size, self.data_type
1165                    )));
1166                }
1167
1168                Ok(())
1169            }
1170            DataType::Struct(fields) => {
1171                self.validate_num_child_data(fields.len())?;
1172                for (i, field) in fields.iter().enumerate() {
1173                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1174
1175                    // Ensure child field has sufficient size
1176                    if field_data.len < self.len {
1177                        return Err(ArrowError::InvalidArgumentError(format!(
1178                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1179                            self.data_type,
1180                            i,
1181                            field.name(),
1182                            field_data.len,
1183                            self.len
1184                        )));
1185                    }
1186                }
1187                Ok(())
1188            }
1189            DataType::RunEndEncoded(run_ends_field, values_field) => {
1190                self.validate_num_child_data(2)?;
1191                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1192                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1193                if run_ends_data.len != values_data.len {
1194                    return Err(ArrowError::InvalidArgumentError(format!(
1195                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1196                        run_ends_data.len, values_data.len
1197                    )));
1198                }
1199                if run_ends_data.nulls.is_some() {
1200                    return Err(ArrowError::InvalidArgumentError(
1201                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1202                    ));
1203                }
1204                Ok(())
1205            }
1206            DataType::Union(fields, mode) => {
1207                self.validate_num_child_data(fields.len())?;
1208
1209                for (i, (_, field)) in fields.iter().enumerate() {
1210                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1211
1212                    if mode == &UnionMode::Sparse {
1213                        let len_plus_offset =
1214                            checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
1215                        if field_data.len < len_plus_offset {
1216                            return Err(ArrowError::InvalidArgumentError(format!(
1217                                "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1218                                i, field_data.len, len_plus_offset
1219                            )));
1220                        }
1221                    }
1222                }
1223                Ok(())
1224            }
1225            DataType::Dictionary(_key_type, value_type) => {
1226                self.get_single_valid_child_data(value_type)?;
1227                Ok(())
1228            }
1229            _ => {
1230                // other types do not have child data
1231                if !self.child_data.is_empty() {
1232                    return Err(ArrowError::InvalidArgumentError(format!(
1233                        "Expected no child arrays for type {} but got {}",
1234                        self.data_type,
1235                        self.child_data.len()
1236                    )));
1237                }
1238                Ok(())
1239            }
1240        }
1241    }
1242
1243    /// Ensures that this array data has a single child_data with the
1244    /// expected type, and calls `validate()` on it. Returns a
1245    /// reference to that child_data
1246    fn get_single_valid_child_data(
1247        &self,
1248        expected_type: &DataType,
1249    ) -> Result<&ArrayData, ArrowError> {
1250        self.validate_num_child_data(1)?;
1251        self.get_valid_child_data(0, expected_type)
1252    }
1253
1254    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1255    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1256        if self.child_data.len() != expected_len {
1257            Err(ArrowError::InvalidArgumentError(format!(
1258                "Value data for {} should contain {} child data array(s), had {}",
1259                self.data_type,
1260                expected_len,
1261                self.child_data.len()
1262            )))
1263        } else {
1264            Ok(())
1265        }
1266    }
1267
1268    /// Ensures that `child_data[i]` has the expected type, calls
1269    /// `validate()` on it, and returns a reference to that child_data
1270    fn get_valid_child_data(
1271        &self,
1272        i: usize,
1273        expected_type: &DataType,
1274    ) -> Result<&ArrayData, ArrowError> {
1275        let values_data = self.child_data.get(i).ok_or_else(|| {
1276            ArrowError::InvalidArgumentError(format!(
1277                "{} did not have enough child arrays. Expected at least {} but had only {}",
1278                self.data_type,
1279                i + 1,
1280                self.child_data.len()
1281            ))
1282        })?;
1283
1284        if expected_type != &values_data.data_type {
1285            return Err(ArrowError::InvalidArgumentError(format!(
1286                "Child type mismatch for {}. Expected {} but child data had {}",
1287                self.data_type, expected_type, values_data.data_type
1288            )));
1289        }
1290
1291        values_data.validate()?;
1292        Ok(values_data)
1293    }
1294
1295    /// Validate that the data contained within this [`ArrayData`] is valid
1296    ///
1297    /// 1. Null count is correct
1298    /// 2. All offsets are valid
1299    /// 3. All String data is valid UTF-8
1300    /// 4. All dictionary offsets are valid
1301    ///
1302    /// Internally this calls:
1303    ///
1304    /// * [`Self::validate`]
1305    /// * [`Self::validate_nulls`]
1306    /// * [`Self::validate_values`]
1307    ///
1308    /// Note: this does not recurse into children, for a recursive variant
1309    /// see [`Self::validate_full`]
1310    pub fn validate_data(&self) -> Result<(), ArrowError> {
1311        self.validate()?;
1312
1313        self.validate_nulls()?;
1314        self.validate_values()?;
1315        Ok(())
1316    }
1317
1318    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1319    ///
1320    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1321    /// and all its children recursively
1322    pub fn validate_full(&self) -> Result<(), ArrowError> {
1323        self.validate_data()?;
1324        // validate all children recursively
1325        self.child_data
1326            .iter()
1327            .enumerate()
1328            .try_for_each(|(i, child_data)| {
1329                child_data.validate_full().map_err(|e| {
1330                    ArrowError::InvalidArgumentError(format!(
1331                        "{} child #{} invalid: {}",
1332                        self.data_type, i, e
1333                    ))
1334                })
1335            })?;
1336        Ok(())
1337    }
1338
1339    /// Validates the values stored within this [`ArrayData`] are valid
1340    /// without recursing into child [`ArrayData`]
1341    ///
1342    /// Does not (yet) check
1343    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1344    /// 2. the the null count is correct and that any
1345    /// 3. nullability requirements of its children are correct
1346    ///
1347    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1348    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1349        if let Some(nulls) = &self.nulls {
1350            let actual = nulls.len() - nulls.inner().count_set_bits();
1351            if actual != nulls.null_count() {
1352                return Err(ArrowError::InvalidArgumentError(format!(
1353                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1354                    nulls.null_count(),
1355                    actual
1356                )));
1357            }
1358        }
1359
1360        // In general non-nullable children should not contain nulls, however, for certain
1361        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1362        // space in the child. As such we permit nulls in the children in the corresponding
1363        // positions for such types
1364        match &self.data_type {
1365            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1366                if !f.is_nullable() {
1367                    self.validate_non_nullable(None, &self.child_data[0])?
1368                }
1369            }
1370            DataType::FixedSizeList(field, len) => {
1371                let child = &self.child_data[0];
1372                if !field.is_nullable() {
1373                    match &self.nulls {
1374                        Some(nulls) => {
1375                            let element_len = *len as usize;
1376                            let expanded = nulls.expand(element_len);
1377                            self.validate_non_nullable(Some(&expanded), child)?;
1378                        }
1379                        None => self.validate_non_nullable(None, child)?,
1380                    }
1381                }
1382            }
1383            DataType::Struct(fields) => {
1384                for (field, child) in fields.iter().zip(&self.child_data) {
1385                    if !field.is_nullable() {
1386                        self.validate_non_nullable(self.nulls(), child)?
1387                    }
1388                }
1389            }
1390            _ => {}
1391        }
1392
1393        Ok(())
1394    }
1395
1396    /// Verifies that `child` contains no nulls not present in `mask`
1397    fn validate_non_nullable(
1398        &self,
1399        mask: Option<&NullBuffer>,
1400        child: &ArrayData,
1401    ) -> Result<(), ArrowError> {
1402        let mask = match mask {
1403            Some(mask) => mask,
1404            None => {
1405                return match child.null_count() {
1406                    0 => Ok(()),
1407                    _ => Err(ArrowError::InvalidArgumentError(format!(
1408                        "non-nullable child of type {} contains nulls not present in parent {}",
1409                        child.data_type, self.data_type
1410                    ))),
1411                };
1412            }
1413        };
1414
1415        match child.nulls() {
1416            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1417                "non-nullable child of type {} contains nulls not present in parent",
1418                child.data_type
1419            ))),
1420            _ => Ok(()),
1421        }
1422    }
1423
1424    /// Validates the values stored within this [`ArrayData`] are valid
1425    /// without recursing into child [`ArrayData`]
1426    ///
1427    /// Does not (yet) check
1428    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1429    pub fn validate_values(&self) -> Result<(), ArrowError> {
1430        match &self.data_type {
1431            DataType::Utf8 => self.validate_utf8::<i32>(),
1432            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1433            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1434            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1435            DataType::BinaryView => {
1436                let views = self.typed_buffer::<u128>(0, self.len)?;
1437                validate_binary_view(views, &self.buffers[1..])
1438            }
1439            DataType::Utf8View => {
1440                let views = self.typed_buffer::<u128>(0, self.len)?;
1441                validate_string_view(views, &self.buffers[1..])
1442            }
1443            DataType::List(_) | DataType::Map(_, _) => {
1444                let child = &self.child_data[0];
1445                self.validate_offsets_full::<i32>(child.len)
1446            }
1447            DataType::LargeList(_) => {
1448                let child = &self.child_data[0];
1449                self.validate_offsets_full::<i64>(child.len)
1450            }
1451            DataType::Union(_, _) => {
1452                // Validate Union Array as part of implementing new Union semantics
1453                // See comments in `ArrayData::validate()`
1454                // https://github.com/apache/arrow-rs/issues/85
1455                //
1456                // TODO file follow on ticket for full union validation
1457                Ok(())
1458            }
1459            DataType::Dictionary(key_type, _value_type) => {
1460                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1461                let max_value = dictionary_length - 1;
1462                match key_type.as_ref() {
1463                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1464                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1465                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1466                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1467                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1468                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1469                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1470                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1471                    _ => unreachable!(),
1472                }
1473            }
1474            DataType::RunEndEncoded(run_ends, _values) => {
1475                let run_ends_data = self.child_data()[0].clone();
1476                match run_ends.data_type() {
1477                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1478                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1479                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1480                    _ => unreachable!(),
1481                }
1482            }
1483            _ => {
1484                // No extra validation check required for other types
1485                Ok(())
1486            }
1487        }
1488    }
1489
1490    /// Calls the `validate(item_index, range)` function for each of
1491    /// the ranges specified in the arrow offsets buffer of type
1492    /// `T`. Also validates that each offset is smaller than
1493    /// `offset_limit`
1494    ///
1495    /// For an empty array, the offsets buffer can either be empty
1496    /// or contain a single `0`.
1497    ///
1498    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1499    /// function would call `validate([1,2])`, and `validate([2,4])`
1500    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1501    where
1502        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1503        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1504    {
1505        self.typed_offsets::<T>()?
1506            .iter()
1507            .enumerate()
1508            .map(|(i, x)| {
1509                // check if the offset can be converted to usize
1510                let r = x.to_usize().ok_or_else(|| {
1511                    ArrowError::InvalidArgumentError(format!(
1512                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1513                    );
1514                // check if the offset exceeds the limit
1515                match r {
1516                    Ok(n) if n <= offset_limit => Ok((i, n)),
1517                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1518                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1519                    ),
1520                    Err(e) => Err(e),
1521                }
1522            })
1523            .scan(0_usize, |start, end| {
1524                // check offsets are monotonically increasing
1525                match end {
1526                    Ok((i, end)) if *start <= end => {
1527                        let range = Some(Ok((i, *start..end)));
1528                        *start = end;
1529                        range
1530                    }
1531                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1532                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1533                        i - 1, start, end))
1534                    )),
1535                    Err(err) => Some(Err(err)),
1536                }
1537            })
1538            .skip(1) // the first element is meaningless
1539            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1540                let (item_index, range) = res?;
1541                validate(item_index-1, range)
1542            })
1543    }
1544
1545    /// Ensures that all strings formed by the offsets in `buffers[0]`
1546    /// into `buffers[1]` are valid utf8 sequences
1547    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1548    where
1549        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1550    {
1551        let values_buffer = &self.buffers[1].as_slice();
1552        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1553            // Validate Offsets are correct
1554            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1555                if !values_str.is_char_boundary(range.start)
1556                    || !values_str.is_char_boundary(range.end)
1557                {
1558                    return Err(ArrowError::InvalidArgumentError(format!(
1559                        "incomplete utf-8 byte sequence from index {string_index}"
1560                    )));
1561                }
1562                Ok(())
1563            })
1564        } else {
1565            // find specific offset that failed utf8 validation
1566            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1567                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1568                    ArrowError::InvalidArgumentError(format!(
1569                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1570                    ))
1571                })?;
1572                Ok(())
1573            })
1574        }
1575    }
1576
1577    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1578    /// between `0` and `offset_limit`
1579    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1580    where
1581        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1582    {
1583        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1584            // No validation applied to each value, but the iteration
1585            // itself applies bounds checking to each range
1586            Ok(())
1587        })
1588    }
1589
1590    /// Validates that each value in self.buffers (typed as T)
1591    /// is within the range [0, max_value], inclusive
1592    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1593    where
1594        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1595    {
1596        let required_len = checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
1597        let buffer = &self.buffers[0];
1598
1599        // This should have been checked as part of `validate()` prior
1600        // to calling `validate_full()` but double check to be sure
1601        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1602
1603        // Justification: buffer size was validated above
1604        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..required_len];
1605
1606        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1607            // Do not check the value is null (value can be arbitrary)
1608            if self.is_null(i) {
1609                return Ok(());
1610            }
1611            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1612                ArrowError::InvalidArgumentError(format!(
1613                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1614                ))
1615            })?;
1616
1617            if dict_index < 0 || dict_index > max_value {
1618                return Err(ArrowError::InvalidArgumentError(format!(
1619                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1620                )));
1621            }
1622            Ok(())
1623        })
1624    }
1625
1626    /// Validates that each value in run_ends array is positive and strictly increasing.
1627    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1628    where
1629        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1630    {
1631        let values = self.typed_buffer::<T>(0, self.len)?;
1632        let mut prev_value: i64 = 0_i64;
1633        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1634            let value: i64 = inp_value.try_into().map_err(|_| {
1635                ArrowError::InvalidArgumentError(format!(
1636                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1637                ))
1638            })?;
1639            if value <= 0_i64 {
1640                return Err(ArrowError::InvalidArgumentError(format!(
1641                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1642                )));
1643            }
1644            if ix > 0 && value <= prev_value {
1645                return Err(ArrowError::InvalidArgumentError(format!(
1646                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1647                )));
1648            }
1649
1650            prev_value = value;
1651            Ok(())
1652        })?;
1653
1654        let len_plus_offset = checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
1655        if prev_value.as_usize() < len_plus_offset {
1656            return Err(ArrowError::InvalidArgumentError(format!(
1657                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1658                len_plus_offset
1659            )));
1660        }
1661        Ok(())
1662    }
1663
1664    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1665    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1666    /// return false when the arrays are logically equal
1667    pub fn ptr_eq(&self, other: &Self) -> bool {
1668        if self.offset != other.offset
1669            || self.len != other.len
1670            || self.data_type != other.data_type
1671            || self.buffers.len() != other.buffers.len()
1672            || self.child_data.len() != other.child_data.len()
1673        {
1674            return false;
1675        }
1676
1677        match (&self.nulls, &other.nulls) {
1678            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1679            (Some(_), None) | (None, Some(_)) => return false,
1680            _ => {}
1681        };
1682
1683        if !self
1684            .buffers
1685            .iter()
1686            .zip(other.buffers.iter())
1687            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1688        {
1689            return false;
1690        }
1691
1692        self.child_data
1693            .iter()
1694            .zip(other.child_data.iter())
1695            .all(|(a, b)| a.ptr_eq(b))
1696    }
1697
1698    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1699    pub fn into_builder(self) -> ArrayDataBuilder {
1700        self.into()
1701    }
1702
1703    /// Claim memory used by this ArrayData in the provided memory pool.
1704    ///
1705    /// This claims memory for:
1706    /// - All buffers in self.buffers
1707    /// - All child ArrayData recursively
1708    /// - The null buffer if present
1709    #[cfg(feature = "pool")]
1710    pub fn claim(&self, pool: &dyn arrow_buffer::MemoryPool) {
1711        // Claim all data buffers
1712        for buffer in &self.buffers {
1713            buffer.claim(pool);
1714        }
1715
1716        // Claim null buffer if present
1717        if let Some(nulls) = &self.nulls {
1718            nulls.claim(pool);
1719        }
1720
1721        // Recursively claim child data
1722        for child in &self.child_data {
1723            child.claim(pool);
1724        }
1725    }
1726}
1727
1728/// Return the expected [`DataTypeLayout`] Arrays of this data
1729/// type are expected to have
1730pub fn layout(data_type: &DataType) -> DataTypeLayout {
1731    // based on C/C++ implementation in
1732    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1733    use arrow_schema::IntervalUnit::*;
1734
1735    match data_type {
1736        DataType::Null => DataTypeLayout {
1737            buffers: vec![],
1738            can_contain_null_mask: false,
1739            variadic: false,
1740        },
1741        DataType::Boolean => DataTypeLayout {
1742            buffers: vec![BufferSpec::BitMap],
1743            can_contain_null_mask: true,
1744            variadic: false,
1745        },
1746        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1747        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1748        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1749        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1750        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1751        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1752        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1753        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1754        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1755        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1756        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1757        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1758        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1759        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1760        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1761        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1762        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1763        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1764        DataType::Interval(MonthDayNano) => {
1765            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1766        }
1767        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1768        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1769        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1770        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1771        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1772        DataType::FixedSizeBinary(size) => {
1773            let spec = BufferSpec::FixedWidth {
1774                byte_width: (*size).try_into().unwrap(),
1775                alignment: mem::align_of::<u8>(),
1776            };
1777            DataTypeLayout {
1778                buffers: vec![spec],
1779                can_contain_null_mask: true,
1780                variadic: false,
1781            }
1782        }
1783        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1784        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1785        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1786        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1787        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1788        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1789        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1790        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1791        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1792        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1793        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1794        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1795        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1796        DataType::Union(_, mode) => {
1797            let type_ids = BufferSpec::FixedWidth {
1798                byte_width: mem::size_of::<i8>(),
1799                alignment: mem::align_of::<i8>(),
1800            };
1801
1802            DataTypeLayout {
1803                buffers: match mode {
1804                    UnionMode::Sparse => {
1805                        vec![type_ids]
1806                    }
1807                    UnionMode::Dense => {
1808                        vec![
1809                            type_ids,
1810                            BufferSpec::FixedWidth {
1811                                byte_width: mem::size_of::<i32>(),
1812                                alignment: mem::align_of::<i32>(),
1813                            },
1814                        ]
1815                    }
1816                },
1817                can_contain_null_mask: false,
1818                variadic: false,
1819            }
1820        }
1821        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1822    }
1823}
1824
/// Layout specification for a data type: which buffers an array of that type
/// is expected to have, and whether it may carry a null bitmask
#[derive(Debug, PartialEq, Eq)]
// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
pub struct DataTypeLayout {
    /// A vector of buffer layout specifications, one for each expected buffer
    pub buffers: Vec<BufferSpec>,

    /// Whether arrays of this type can contain a null bitmask
    pub can_contain_null_mask: bool,

    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
    /// buffers.len(). Buffers that exceed the lower bound are legal.
    pub variadic: bool,
}
1840
1841impl DataTypeLayout {
1842    /// Describes a basic numeric array where each element has type `T`
1843    pub fn new_fixed_width<T>() -> Self {
1844        Self {
1845            buffers: vec![BufferSpec::FixedWidth {
1846                byte_width: mem::size_of::<T>(),
1847                alignment: mem::align_of::<T>(),
1848            }],
1849            can_contain_null_mask: true,
1850            variadic: false,
1851        }
1852    }
1853
1854    /// Describes arrays which have no data of their own
1855    /// but may still have a Null Bitmap (e.g. FixedSizeList)
1856    pub fn new_nullable_empty() -> Self {
1857        Self {
1858            buffers: vec![],
1859            can_contain_null_mask: true,
1860            variadic: false,
1861        }
1862    }
1863
1864    /// Describes arrays which have no data of their own
1865    /// (e.g. RunEndEncoded).
1866    pub fn new_empty() -> Self {
1867        Self {
1868            buffers: vec![],
1869            can_contain_null_mask: false,
1870            variadic: false,
1871        }
1872    }
1873
1874    /// Describes a basic numeric array where each element has a fixed
1875    /// with offset buffer of type `T`, followed by a
1876    /// variable width data buffer
1877    pub fn new_binary<T>() -> Self {
1878        Self {
1879            buffers: vec![
1880                // offsets
1881                BufferSpec::FixedWidth {
1882                    byte_width: mem::size_of::<T>(),
1883                    alignment: mem::align_of::<T>(),
1884                },
1885                // values
1886                BufferSpec::VariableWidth,
1887            ],
1888            can_contain_null_mask: true,
1889            variadic: false,
1890        }
1891    }
1892
1893    /// Describes a view type
1894    pub fn new_view() -> Self {
1895        Self {
1896            buffers: vec![BufferSpec::FixedWidth {
1897                byte_width: mem::size_of::<u128>(),
1898                alignment: mem::align_of::<u128>(),
1899            }],
1900            can_contain_null_mask: true,
1901            variadic: true,
1902        }
1903    }
1904
1905    /// Describes a list view type
1906    pub fn new_list_view<T>() -> Self {
1907        Self {
1908            buffers: vec![
1909                BufferSpec::FixedWidth {
1910                    byte_width: mem::size_of::<T>(),
1911                    alignment: mem::align_of::<T>(),
1912                },
1913                BufferSpec::FixedWidth {
1914                    byte_width: mem::size_of::<T>(),
1915                    alignment: mem::align_of::<T>(),
1916                },
1917            ],
1918            can_contain_null_mask: true,
1919            variadic: false,
1920        }
1921    }
1922}
1923
/// Layout specification for a single data type buffer, as listed in
/// [`DataTypeLayout::buffers`]
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
    ///
    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
    ///
    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
    ///
    /// Note that these alignment requirements will vary between architectures
    FixedWidth {
        /// The width of each element in bytes
        byte_width: usize,
        /// The alignment required by Rust for an array of the corresponding primitive
        alignment: usize,
    },
    /// Variable width, such as string data for utf8 data
    VariableWidth,
    /// Buffer holds a bitmap.
    ///
    /// Note: Unlike the C++ implementation, the null/validity buffer
    /// is handled specially rather than as another of the buffers in
    /// the spec, so this variant is only used for the Boolean type.
    BitMap,
    /// Buffer is always null. Currently unused in the Rust implementation
    /// (used in C++ for the Union type)
    #[allow(dead_code)]
    AlwaysNull,
}
1956
1957impl PartialEq for ArrayData {
1958    fn eq(&self, other: &Self) -> bool {
1959        equal::equal(self, other)
1960    }
1961}
1962
/// A boolean flag whose value can only be changed from `unsafe` code.
///
/// The flag starts out as `false`, both via [`UnsafeFlag::new`] and via
/// [`Default`].
///
/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
///
/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
///
/// # Example
/// ```rust
/// use arrow_data::UnsafeFlag;
/// assert!(!UnsafeFlag::default().get()); // default is false
/// let mut flag = UnsafeFlag::new();
/// assert!(!flag.get()); // defaults to false
/// // can only set it to true in unsafe code
/// unsafe { flag.set(true) };
/// assert!(flag.get()); // now true
/// ```
#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a new `UnsafeFlag` with the value set to `false`.
    ///
    /// See the examples on [`UnsafeFlag`]
    #[inline]
    pub const fn new() -> Self {
        UnsafeFlag(false)
    }

    /// Overwrites the flag with the given value
    ///
    /// Note this can purposely only be done in `unsafe` code
    ///
    /// # Safety
    ///
    /// Storing the value is not immediately unsafe in itself. However, the
    /// flag can be used to subsequently bypass safety checks in the
    /// [`ArrayDataBuilder`], so the caller takes responsibility for whatever
    /// those checks would have guaranteed.
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        let Self(flag) = self;
        *flag = val;
    }

    /// Returns the current value of the flag
    #[inline]
    pub fn get(&self) -> bool {
        let Self(flag) = self;
        *flag
    }
}

// Implemented by hand (rather than derived) to make it explicit that the
// default value is `false`: a `true` flag can only come from `unsafe` code
impl Default for UnsafeFlag {
    fn default() -> Self {
        UnsafeFlag::new()
    }
}
2021
/// Builder for [`ArrayData`] type
///
/// Values are collected through the setter methods and turned into an
/// [`ArrayData`] by `build`, which validates the result unless validation
/// has been skipped via the `unsafe` APIs.
#[derive(Debug)]
pub struct ArrayDataBuilder {
    /// The data type of the array being built
    data_type: DataType,
    /// The length of the array being built
    len: usize,
    /// Optional null count; when set, `build` uses it for a null mask created
    /// from `null_bit_buffer` instead of counting the nulls
    null_count: Option<usize>,
    /// Optional raw null bitmap; only consulted by `build` when `nulls` is `None`
    null_bit_buffer: Option<Buffer>,
    /// Optional null buffer; takes precedence over `null_bit_buffer` in `build`
    nulls: Option<NullBuffer>,
    /// The offset of the array being built
    offset: usize,
    /// The buffers of the array being built
    buffers: Vec<Buffer>,
    /// The child data of the array being built
    child_data: Vec<ArrayData>,
    /// Should buffers be realigned (copying if necessary)?
    ///
    /// Defaults to false.
    align_buffers: bool,
    /// Should data validation be skipped for this [`ArrayData`]?
    ///
    /// Defaults to false.
    ///
    /// # Safety
    ///
    /// This flag can only be set to true using `unsafe` APIs. However, once true
    /// subsequent calls to `build()` may result in undefined behavior if the data
    /// is not valid.
    skip_validation: UnsafeFlag,
}
2048
2049impl ArrayDataBuilder {
2050    #[inline]
2051    /// Creates a new array data builder
2052    pub const fn new(data_type: DataType) -> Self {
2053        Self {
2054            data_type,
2055            len: 0,
2056            null_count: None,
2057            null_bit_buffer: None,
2058            nulls: None,
2059            offset: 0,
2060            buffers: vec![],
2061            child_data: vec![],
2062            align_buffers: false,
2063            skip_validation: UnsafeFlag::new(),
2064        }
2065    }
2066
2067    /// Creates a new array data builder from an existing one, changing the data type
2068    pub fn data_type(self, data_type: DataType) -> Self {
2069        Self { data_type, ..self }
2070    }
2071
2072    #[inline]
2073    #[allow(clippy::len_without_is_empty)]
2074    /// Sets the length of the [ArrayData]
2075    pub const fn len(mut self, n: usize) -> Self {
2076        self.len = n;
2077        self
2078    }
2079
2080    /// Sets the null buffer of the [ArrayData]
2081    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
2082        self.nulls = nulls;
2083        self.null_count = None;
2084        self.null_bit_buffer = None;
2085        self
2086    }
2087
2088    /// Sets the null count of the [ArrayData]
2089    pub fn null_count(mut self, null_count: usize) -> Self {
2090        self.null_count = Some(null_count);
2091        self
2092    }
2093
2094    /// Sets the `null_bit_buffer` of the [ArrayData]
2095    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
2096        self.nulls = None;
2097        self.null_bit_buffer = buf;
2098        self
2099    }
2100
2101    /// Sets the offset of the [ArrayData]
2102    #[inline]
2103    pub const fn offset(mut self, n: usize) -> Self {
2104        self.offset = n;
2105        self
2106    }
2107
2108    /// Sets the buffers of the [ArrayData]
2109    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
2110        self.buffers = v;
2111        self
2112    }
2113
2114    /// Adds a single buffer to the [ArrayData]'s buffers
2115    pub fn add_buffer(mut self, b: Buffer) -> Self {
2116        self.buffers.push(b);
2117        self
2118    }
2119
2120    /// Adds multiple buffers to the [ArrayData]'s buffers
2121    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
2122        self.buffers.extend(bs);
2123        self
2124    }
2125
2126    /// Sets the child data of the [ArrayData]
2127    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
2128        self.child_data = v;
2129        self
2130    }
2131
2132    /// Adds a single child data to the [ArrayData]'s child data
2133    pub fn add_child_data(mut self, r: ArrayData) -> Self {
2134        self.child_data.push(r);
2135        self
2136    }
2137
2138    /// Creates an array data, without any validation
2139    ///
2140    /// Note: This is shorthand for
2141    /// ```rust
2142    /// # #[expect(unsafe_op_in_unsafe_fn)]
2143    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
2144    /// # let _ = unsafe {
2145    /// builder.skip_validation(true).build().unwrap()
2146    /// # };
2147    /// ```
2148    ///
2149    /// # Safety
2150    ///
2151    /// The same caveats as [`ArrayData::new_unchecked`]
2152    /// apply.
2153    pub unsafe fn build_unchecked(self) -> ArrayData {
2154        unsafe { self.skip_validation(true) }.build().unwrap()
2155    }
2156
2157    /// Creates an `ArrayData`, consuming `self`
2158    ///
2159    /// # Safety
2160    ///
2161    /// By default the underlying buffers are checked to ensure they are valid
2162    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
2163    /// to true (by the `unsafe` API) this validation is skipped. If the data is
2164    /// not valid, undefined behavior will result.
2165    pub fn build(self) -> Result<ArrayData, ArrowError> {
2166        let Self {
2167            data_type,
2168            len,
2169            null_count,
2170            null_bit_buffer,
2171            nulls,
2172            offset,
2173            buffers,
2174            child_data,
2175            align_buffers,
2176            skip_validation,
2177        } = self;
2178
2179        let nulls = nulls
2180            .or_else(|| {
2181                let buffer = null_bit_buffer?;
2182                let buffer = BooleanBuffer::new(buffer, offset, len);
2183                Some(match null_count {
2184                    Some(n) => {
2185                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2186                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2187                    }
2188                    None => NullBuffer::new(buffer),
2189                })
2190            })
2191            .filter(|b| b.null_count() != 0);
2192
2193        let mut data = ArrayData {
2194            data_type,
2195            len,
2196            offset,
2197            buffers,
2198            child_data,
2199            nulls,
2200        };
2201
2202        if align_buffers {
2203            data.align_buffers();
2204        }
2205
2206        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2207        if !skip_validation.get() || cfg!(feature = "force_validate") {
2208            data.validate_data()?;
2209        }
2210        Ok(data)
2211    }
2212
2213    /// Ensure that all buffers are aligned, copying data if necessary
2214    ///
2215    /// Rust requires that arrays are aligned to their corresponding primitive,
2216    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2217    ///
2218    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2219    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2220    ///
2221    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2222    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2223    /// when necessary, making it useful when interacting with buffers produced by other systems,
2224    /// e.g. IPC or FFI.
2225    ///
2226    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2227    /// insufficiently aligned buffers.
2228    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2229        self.align_buffers = align_buffers;
2230        self
2231    }
2232
2233    /// Skips validation of the data.
2234    ///
2235    /// If this flag is enabled, `[Self::build`] will skip validation of the
2236    /// data
2237    ///
2238    /// If this flag is not enabled, `[Self::build`] will validate that all
2239    /// buffers are valid and will return an error if any data is invalid.
2240    /// Validation can be expensive.
2241    ///
2242    /// # Safety
2243    ///
2244    /// If validation is skipped, the buffers must form a valid Arrow array,
2245    /// otherwise undefined behavior will result
2246    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2247        unsafe {
2248            self.skip_validation.set(skip_validation);
2249        }
2250        self
2251    }
2252}
2253
2254impl From<ArrayData> for ArrayDataBuilder {
2255    fn from(d: ArrayData) -> Self {
2256        Self {
2257            data_type: d.data_type,
2258            len: d.len,
2259            offset: d.offset,
2260            buffers: d.buffers,
2261            child_data: d.child_data,
2262            nulls: d.nulls,
2263            null_bit_buffer: None,
2264            null_count: None,
2265            align_buffers: false,
2266            skip_validation: UnsafeFlag::new(),
2267        }
2268    }
2269}
2270
2271/// Get byte width of FixedSizeBinary size
2272/// # Panics:
2273/// - Panics if the `data_type` is not FixedSizeBinary
2274/// - Panics if byte width is negative
2275pub(crate) fn get_fixed_size_binary_width(data_type: &DataType) -> usize {
2276    match data_type {
2277        DataType::FixedSizeBinary(i) => {
2278            if *i < 0 {
2279                panic!("cannot compare FixedSizeBinary({})", *i);
2280            }
2281            *i as usize
2282        }
2283        _ => unreachable!(),
2284    }
2285}
2286
2287#[cfg(test)]
2288mod tests {
2289    use super::*;
2290    use arrow_schema::{Field, Fields};
2291
2292    // See arrow/tests/array_data_validation.rs for test of array validation
2293
2294    /// returns a buffer initialized with some constant value for tests
2295    fn make_i32_buffer(n: usize) -> Buffer {
2296        Buffer::from_slice_ref(vec![42i32; n])
2297    }
2298
2299    /// returns a buffer initialized with some constant value for tests
2300    fn make_f32_buffer(n: usize) -> Buffer {
2301        Buffer::from_slice_ref(vec![42f32; n])
2302    }
2303
2304    #[test]
2305    fn test_builder() {
2306        // Buffer needs to be at least 25 long
2307        let v = (0..25).collect::<Vec<i32>>();
2308        let b1 = Buffer::from_slice_ref(&v);
2309        let arr_data = ArrayData::builder(DataType::Int32)
2310            .len(20)
2311            .offset(5)
2312            .add_buffer(b1)
2313            .null_bit_buffer(Some(Buffer::from([
2314                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2315            ])))
2316            .build()
2317            .unwrap();
2318
2319        assert_eq!(20, arr_data.len());
2320        assert_eq!(10, arr_data.null_count());
2321        assert_eq!(5, arr_data.offset());
2322        assert_eq!(1, arr_data.buffers().len());
2323        assert_eq!(
2324            Buffer::from_slice_ref(&v).as_slice(),
2325            arr_data.buffers()[0].as_slice()
2326        );
2327    }
2328
2329    #[test]
2330    fn test_builder_with_child_data() {
2331        let child_arr_data = ArrayData::try_new(
2332            DataType::Int32,
2333            5,
2334            None,
2335            0,
2336            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2337            vec![],
2338        )
2339        .unwrap();
2340
2341        let field = Arc::new(Field::new("x", DataType::Int32, true));
2342        let data_type = DataType::Struct(vec![field].into());
2343
2344        let arr_data = ArrayData::builder(data_type)
2345            .len(5)
2346            .offset(0)
2347            .add_child_data(child_arr_data.clone())
2348            .build()
2349            .unwrap();
2350
2351        assert_eq!(5, arr_data.len());
2352        assert_eq!(1, arr_data.child_data().len());
2353        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2354    }
2355
2356    #[test]
2357    fn test_null_count() {
2358        let mut bit_v: [u8; 2] = [0; 2];
2359        bit_util::set_bit(&mut bit_v, 0);
2360        bit_util::set_bit(&mut bit_v, 3);
2361        bit_util::set_bit(&mut bit_v, 10);
2362        let arr_data = ArrayData::builder(DataType::Int32)
2363            .len(16)
2364            .add_buffer(make_i32_buffer(16))
2365            .null_bit_buffer(Some(Buffer::from(bit_v)))
2366            .build()
2367            .unwrap();
2368        assert_eq!(13, arr_data.null_count());
2369
2370        // Test with offset
2371        let mut bit_v: [u8; 2] = [0; 2];
2372        bit_util::set_bit(&mut bit_v, 0);
2373        bit_util::set_bit(&mut bit_v, 3);
2374        bit_util::set_bit(&mut bit_v, 10);
2375        let arr_data = ArrayData::builder(DataType::Int32)
2376            .len(12)
2377            .offset(2)
2378            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2379            .null_bit_buffer(Some(Buffer::from(bit_v)))
2380            .build()
2381            .unwrap();
2382        assert_eq!(10, arr_data.null_count());
2383    }
2384
2385    #[test]
2386    fn test_null_buffer_ref() {
2387        let mut bit_v: [u8; 2] = [0; 2];
2388        bit_util::set_bit(&mut bit_v, 0);
2389        bit_util::set_bit(&mut bit_v, 3);
2390        bit_util::set_bit(&mut bit_v, 10);
2391        let arr_data = ArrayData::builder(DataType::Int32)
2392            .len(16)
2393            .add_buffer(make_i32_buffer(16))
2394            .null_bit_buffer(Some(Buffer::from(bit_v)))
2395            .build()
2396            .unwrap();
2397        assert!(arr_data.nulls().is_some());
2398        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2399    }
2400
2401    #[test]
2402    fn test_slice() {
2403        let mut bit_v: [u8; 2] = [0; 2];
2404        bit_util::set_bit(&mut bit_v, 0);
2405        bit_util::set_bit(&mut bit_v, 3);
2406        bit_util::set_bit(&mut bit_v, 10);
2407        let data = ArrayData::builder(DataType::Int32)
2408            .len(16)
2409            .add_buffer(make_i32_buffer(16))
2410            .null_bit_buffer(Some(Buffer::from(bit_v)))
2411            .build()
2412            .unwrap();
2413        let new_data = data.slice(1, 15);
2414        assert_eq!(data.len() - 1, new_data.len());
2415        assert_eq!(1, new_data.offset());
2416        assert_eq!(data.null_count(), new_data.null_count());
2417
2418        // slice of a slice (removes one null)
2419        let new_data = new_data.slice(1, 14);
2420        assert_eq!(data.len() - 2, new_data.len());
2421        assert_eq!(2, new_data.offset());
2422        assert_eq!(data.null_count() - 1, new_data.null_count());
2423    }
2424
2425    #[test]
2426    #[should_panic(expected = "offset + length overflow")]
2427    fn test_slice_panics_on_offset_length_overflow() {
2428        let data = ArrayData::builder(DataType::Int32)
2429            .len(4)
2430            .add_buffer(make_i32_buffer(4))
2431            .build()
2432            .unwrap();
2433        let sliced = data.slice(1, 3);
2434
2435        sliced.slice(1, usize::MAX);
2436    }
2437
2438    #[test]
2439    fn test_typed_offsets_length_overflow() {
2440        let data = ArrayData {
2441            data_type: DataType::Binary,
2442            len: usize::MAX,
2443            offset: 0,
2444            buffers: vec![Buffer::from_slice_ref([0_i32])],
2445            child_data: vec![],
2446            nulls: None,
2447        };
2448        let err = data.typed_offsets::<i32>().unwrap_err();
2449
2450        assert_eq!(
2451            err.to_string(),
2452            format!(
2453                "Invalid argument error: Length {} with offset 1 overflows usize for Binary",
2454                usize::MAX
2455            )
2456        );
2457    }
2458
2459    #[test]
2460    fn test_validate_typed_buffer_length_overflow() {
2461        let data = ArrayData {
2462            data_type: DataType::Binary,
2463            len: 0,
2464            offset: 2,
2465            buffers: vec![Buffer::from_slice_ref([0_i32])],
2466            child_data: vec![],
2467            nulls: None,
2468        };
2469        let err = data.typed_buffer::<i32>(0, usize::MAX).unwrap_err();
2470
2471        assert_eq!(
2472            err.to_string(),
2473            format!(
2474                "Invalid argument error: Length {} with offset 2 overflows usize for Binary",
2475                usize::MAX
2476            )
2477        );
2478    }
2479
2480    // Exercises ArrayData::try_new with len + offset overflowing
2481    fn try_new_binary_length_offset_overflow() -> Result<ArrayData, ArrowError> {
2482        ArrayData::try_new(
2483            DataType::Binary,
2484            usize::MAX,
2485            None,
2486            1,
2487            vec![
2488                Buffer::from_slice_ref([0_i32]),
2489                Buffer::from_iter(std::iter::empty::<u8>()),
2490            ],
2491            vec![],
2492        )
2493    }
2494
2495    #[cfg(not(feature = "force_validate"))]
2496    #[test]
2497    fn test_try_new_length_offset_overflow() {
2498        let err = try_new_binary_length_offset_overflow().unwrap_err();
2499
2500        assert_eq!(
2501            err.to_string(),
2502            format!(
2503                "Invalid argument error: Length {} with offset 1 overflows usize for Binary",
2504                usize::MAX
2505            )
2506        );
2507    }
2508
2509    #[cfg(feature = "force_validate")]
2510    #[test]
2511    #[should_panic(
2512        expected = "Length 18446744073709551615 with offset 1 overflows usize for Binary"
2513    )]
2514    fn test_try_new_length_offset_overflow_force_validate() {
2515        try_new_binary_length_offset_overflow().unwrap();
2516    }
2517
2518    #[test]
2519    fn test_equality() {
2520        let int_data = ArrayData::builder(DataType::Int32)
2521            .len(1)
2522            .add_buffer(make_i32_buffer(1))
2523            .build()
2524            .unwrap();
2525
2526        let float_data = ArrayData::builder(DataType::Float32)
2527            .len(1)
2528            .add_buffer(make_f32_buffer(1))
2529            .build()
2530            .unwrap();
2531        assert_ne!(int_data, float_data);
2532        assert!(!int_data.ptr_eq(&float_data));
2533        assert!(int_data.ptr_eq(&int_data));
2534
2535        #[allow(clippy::redundant_clone)]
2536        let int_data_clone = int_data.clone();
2537        assert_eq!(int_data, int_data_clone);
2538        assert!(int_data.ptr_eq(&int_data_clone));
2539        assert!(int_data_clone.ptr_eq(&int_data));
2540
2541        let int_data_slice = int_data_clone.slice(1, 0);
2542        assert!(int_data_slice.ptr_eq(&int_data_slice));
2543        assert!(!int_data.ptr_eq(&int_data_slice));
2544        assert!(!int_data_slice.ptr_eq(&int_data));
2545
2546        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2547        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2548        let string_data = ArrayData::try_new(
2549            DataType::Utf8,
2550            3,
2551            Some(Buffer::from_iter(vec![true, false, true])),
2552            0,
2553            vec![offsets_buffer, data_buffer],
2554            vec![],
2555        )
2556        .unwrap();
2557
2558        assert_ne!(float_data, string_data);
2559        assert!(!float_data.ptr_eq(&string_data));
2560
2561        assert!(string_data.ptr_eq(&string_data));
2562
2563        #[allow(clippy::redundant_clone)]
2564        let string_data_cloned = string_data.clone();
2565        assert!(string_data_cloned.ptr_eq(&string_data));
2566        assert!(string_data.ptr_eq(&string_data_cloned));
2567
2568        let string_data_slice = string_data.slice(1, 2);
2569        assert!(string_data_slice.ptr_eq(&string_data_slice));
2570        assert!(!string_data_slice.ptr_eq(&string_data))
2571    }
2572
2573    #[test]
2574    fn test_slice_memory_size() {
2575        let mut bit_v: [u8; 2] = [0; 2];
2576        bit_util::set_bit(&mut bit_v, 0);
2577        bit_util::set_bit(&mut bit_v, 3);
2578        bit_util::set_bit(&mut bit_v, 10);
2579        let data = ArrayData::builder(DataType::Int32)
2580            .len(16)
2581            .add_buffer(make_i32_buffer(16))
2582            .null_bit_buffer(Some(Buffer::from(bit_v)))
2583            .build()
2584            .unwrap();
2585        let new_data = data.slice(1, 14);
2586        assert_eq!(
2587            data.get_slice_memory_size().unwrap() - 8,
2588            new_data.get_slice_memory_size().unwrap()
2589        );
2590        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2591        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2592        let string_data = ArrayData::try_new(
2593            DataType::Utf8,
2594            3,
2595            Some(Buffer::from_iter(vec![true, false, true])),
2596            0,
2597            vec![offsets_buffer, data_buffer],
2598            vec![],
2599        )
2600        .unwrap();
2601        let string_data_slice = string_data.slice(1, 2);
2602        //4 bytes of offset and 2 bytes of data reduced by slicing.
2603        assert_eq!(
2604            string_data.get_slice_memory_size().unwrap() - 6,
2605            string_data_slice.get_slice_memory_size().unwrap()
2606        );
2607    }
2608
2609    #[test]
2610    fn test_count_nulls() {
2611        let buffer = Buffer::from([0b00010110, 0b10011111]);
2612        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2613        let count = count_nulls(Some(&buffer), 0, 16);
2614        assert_eq!(count, 7);
2615
2616        let count = count_nulls(Some(&buffer), 4, 8);
2617        assert_eq!(count, 3);
2618    }
2619
2620    #[test]
2621    fn test_contains_nulls() {
2622        let buffer: Buffer =
2623            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2624        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2625        assert!(contains_nulls(Some(&buffer), 0, 6));
2626        assert!(contains_nulls(Some(&buffer), 0, 3));
2627        assert!(!contains_nulls(Some(&buffer), 3, 2));
2628        assert!(!contains_nulls(Some(&buffer), 0, 0));
2629    }
2630
2631    #[test]
2632    fn test_alignment() {
2633        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2634        let sliced = buffer.slice(1);
2635
2636        let mut data = ArrayData {
2637            data_type: DataType::Int32,
2638            len: 0,
2639            offset: 0,
2640            buffers: vec![buffer],
2641            child_data: vec![],
2642            nulls: None,
2643        };
2644        data.validate_full().unwrap();
2645
2646        // break alignment in data
2647        data.buffers[0] = sliced;
2648        let err = data.validate().unwrap_err();
2649
2650        assert_eq!(
2651            err.to_string(),
2652            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2653        );
2654
2655        data.align_buffers();
2656        data.validate_full().unwrap();
2657    }
2658
2659    #[test]
2660    fn test_alignment_struct() {
2661        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2662        let sliced = buffer.slice(1);
2663
2664        let child_data = ArrayData {
2665            data_type: DataType::Int32,
2666            len: 0,
2667            offset: 0,
2668            buffers: vec![buffer],
2669            child_data: vec![],
2670            nulls: None,
2671        };
2672
2673        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2674        let mut data = ArrayData {
2675            data_type: schema,
2676            len: 0,
2677            offset: 0,
2678            buffers: vec![],
2679            child_data: vec![child_data],
2680            nulls: None,
2681        };
2682        data.validate_full().unwrap();
2683
2684        // break alignment in child data
2685        data.child_data[0].buffers[0] = sliced;
2686        let err = data.validate().unwrap_err();
2687
2688        assert_eq!(
2689            err.to_string(),
2690            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2691        );
2692
2693        data.align_buffers();
2694        data.validate_full().unwrap();
2695    }
2696
2697    #[test]
2698    fn test_null_view_types() {
2699        let array_len = 32;
2700        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2701        assert_eq!(array.len(), array_len);
2702        for i in 0..array.len() {
2703            assert!(array.is_null(i));
2704        }
2705
2706        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2707        assert_eq!(array.len(), array_len);
2708        for i in 0..array.len() {
2709            assert!(array.is_null(i));
2710        }
2711
2712        let array = ArrayData::new_null(
2713            &DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2714            array_len,
2715        );
2716        assert_eq!(array.len(), array_len);
2717        for i in 0..array.len() {
2718            assert!(array.is_null(i));
2719        }
2720
2721        let array = ArrayData::new_null(
2722            &DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2723            array_len,
2724        );
2725        assert_eq!(array.len(), array_len);
2726        for i in 0..array.len() {
2727            assert!(array.is_null(i));
2728        }
2729    }
2730}