Skip to main content

arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
19//! common attributes and operations for Arrow array.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35    null_bit_buffer: Option<&NullBuffer>,
36    offset: usize,
37    len: usize,
38) -> bool {
39    match null_bit_buffer {
40        Some(buffer) => {
41            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42                Some((start, end)) => start != 0 || end != len,
43                None => len != 0, // No non-null values
44            }
45        }
46        None => false, // No null buffer
47    }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52    null_bit_buffer: Option<&NullBuffer>,
53    offset: usize,
54    len: usize,
55) -> usize {
56    if let Some(buf) = null_bit_buffer {
57        let buffer = buf.buffer();
58        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59    } else {
60        0
61    }
62}
63
64/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67    let empty_buffer = MutableBuffer::new(0);
68    match data_type {
69        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70        DataType::Boolean => {
71            let bytes = bit_util::ceil(capacity, 8);
72            let buffer = MutableBuffer::new(bytes);
73            [buffer, empty_buffer]
74        }
75        DataType::UInt8
76        | DataType::UInt16
77        | DataType::UInt32
78        | DataType::UInt64
79        | DataType::Int8
80        | DataType::Int16
81        | DataType::Int32
82        | DataType::Int64
83        | DataType::Float16
84        | DataType::Float32
85        | DataType::Float64
86        | DataType::Decimal32(_, _)
87        | DataType::Decimal64(_, _)
88        | DataType::Decimal128(_, _)
89        | DataType::Decimal256(_, _)
90        | DataType::Date32
91        | DataType::Time32(_)
92        | DataType::Date64
93        | DataType::Time64(_)
94        | DataType::Duration(_)
95        | DataType::Timestamp(_, _)
96        | DataType::Interval(_) => [
97            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98            empty_buffer,
99        ],
100        DataType::Utf8 | DataType::Binary => {
101            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102            // safety: `unsafe` code assumes that this buffer is initialized with one element
103            buffer.push(0i32);
104            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105        }
106        DataType::LargeUtf8 | DataType::LargeBinary => {
107            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108            // safety: `unsafe` code assumes that this buffer is initialized with one element
109            buffer.push(0i64);
110            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111        }
112        DataType::BinaryView | DataType::Utf8View => [
113            MutableBuffer::new(capacity * mem::size_of::<u128>()),
114            empty_buffer,
115        ],
116        DataType::List(_) | DataType::Map(_, _) => {
117            // offset buffer always starts with a zero
118            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119            buffer.push(0i32);
120            [buffer, empty_buffer]
121        }
122        DataType::ListView(_) => [
123            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124            MutableBuffer::new(capacity * mem::size_of::<i32>()),
125        ],
126        DataType::LargeList(_) => {
127            // offset buffer always starts with a zero
128            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129            buffer.push(0i64);
130            [buffer, empty_buffer]
131        }
132        DataType::LargeListView(_) => [
133            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134            MutableBuffer::new(capacity * mem::size_of::<i64>()),
135        ],
136        DataType::FixedSizeBinary(size) => {
137            if *size < 0 {
138                panic!("cannot construct buffers from FixedSizeBinary({size})");
139            }
140            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
141        }
142        DataType::Dictionary(k, _) => [
143            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
144            empty_buffer,
145        ],
146        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
147            [empty_buffer, MutableBuffer::new(0)]
148        }
149        DataType::Union(_, mode) => {
150            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
151            match mode {
152                UnionMode::Sparse => [type_ids, empty_buffer],
153                UnionMode::Dense => {
154                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
155                    [type_ids, offsets]
156                }
157            }
158        }
159    }
160}
161
162/// A generic representation of Arrow array data which encapsulates common attributes
163/// and operations for Arrow array.
164///
165/// Specific operations for different arrays types (e.g., primitive, list, struct)
166/// are implemented in `Array`.
167///
168/// # Memory Layout
169///
170/// `ArrayData` has references to one or more underlying data buffers
171/// and optional child ArrayData, depending on type as illustrated
172/// below. Bitmaps are not shown for simplicity but they are stored
173/// similarly to the buffers.
174///
175/// ```text
176///                        offset
177///                       points to
178/// ┌───────────────────┐ start of  ┌───────┐       Different
179/// │                   │   data    │       │     ArrayData may
180/// │ArrayData {        │           │....   │     also refers to
181/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
182/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
183/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
184/// │  buffers: [       │           │5882   │◀─
185/// │    ...            │  │        │4323   │
186/// │  ]                │   ─ ─ ─ ─▶│4859   │
187/// │  child_data: [    │           │....   │
188/// │    ...            │           │       │
189/// │  ]                │           └───────┘
190/// │}                  │
191/// │                   │            Shared Buffer uses
192/// │               │   │            bytes::Bytes to hold
193/// └───────────────────┘            actual data values
194///           ┌ ─ ─ ┘
195///
196///           ▼
197/// ┌───────────────────┐
198/// │ArrayData {        │
199/// │  ...              │
200/// │}                  │
201/// │                   │
202/// └───────────────────┘
203///
204/// Child ArrayData may also have its own buffers and children
205/// ```
206
207#[derive(Debug, Clone)]
208pub struct ArrayData {
209    /// The data type
210    data_type: DataType,
211
212    /// The number of elements
213    len: usize,
214
215    /// The offset in number of items (not bytes).
216    ///
217    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
218    /// does NOT apply to [`Self::nulls`].
219    offset: usize,
220
221    /// The buffers that store the actual data for this array, as defined
222    /// in the [Arrow Spec].
223    ///
224    /// Depending on the array types, [`Self::buffers`] can hold different
225    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
226    /// positions.
227    ///
228    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
229    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
230    /// we need to satisfy `len`.
231    ///
232    /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout)
233    buffers: Vec<Buffer>,
234
235    /// The child(ren) of this array.
236    ///
237    /// Only non-empty for nested types, such as `ListArray` and
238    /// `StructArray`.
239    ///
240    /// The first logical element in each child element begins at `offset`.
241    ///
242    /// If the child element also has an offset then these offsets are
243    /// cumulative.
244    child_data: Vec<ArrayData>,
245
246    /// The null bitmap.
247    ///
248    /// `None` indicates all values are non-null in this array.
249    ///
250    /// [`Self::offset]` does not apply to the null bitmap. While the
251    /// BooleanBuffer may be sliced (have its own offset) internally, this
252    /// `NullBuffer` always represents exactly `len` elements.
253    nulls: Option<NullBuffer>,
254}
255
256/// A thread-safe, shared reference to the Arrow array data.
257pub type ArrayDataRef = Arc<ArrayData>;
258
259fn checked_len_plus_offset(
260    data_type: &DataType,
261    len: usize,
262    offset: usize,
263) -> Result<usize, ArrowError> {
264    len.checked_add(offset).ok_or_else(|| {
265        ArrowError::InvalidArgumentError(format!(
266            "Length {len} with offset {offset} overflows usize for {data_type}"
267        ))
268    })
269}
270
271impl ArrayData {
272    /// Create a new ArrayData instance;
273    ///
274    /// If `null_count` is not specified, the number of nulls in
275    /// null_bit_buffer is calculated.
276    ///
277    /// If the number of nulls is 0 then the null_bit_buffer
278    /// is set to `None`.
279    ///
280    /// # Safety
281    ///
282    /// The input values *must* form a valid Arrow array for
283    /// `data_type`, or undefined behavior can result.
284    ///
285    /// Note: This is a low level API and most users of the arrow
286    /// crate should create arrays using the methods in the `array`
287    /// module.
288    pub unsafe fn new_unchecked(
289        data_type: DataType,
290        len: usize,
291        null_count: Option<usize>,
292        null_bit_buffer: Option<Buffer>,
293        offset: usize,
294        buffers: Vec<Buffer>,
295        child_data: Vec<ArrayData>,
296    ) -> Self {
297        let builder = Self::inner_new_builder(
298            data_type,
299            len,
300            null_count,
301            null_bit_buffer,
302            offset,
303            buffers,
304            child_data,
305        );
306
307        // SAFETY: caller responsible for ensuring data is valid
308        unsafe { builder.build_unchecked() }
309    }
310
311    /// Create a new ArrayData, validating that the provided buffers form a valid
312    /// Arrow array of the specified data type.
313    ///
314    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
315    /// is set to `None`.
316    ///
317    /// Internally this calls through to [`Self::validate_data`]
318    ///
319    /// Note: This is a low level API and most users of the arrow crate should create
320    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
321    /// or [`ArrayDataBuilder`].
322    ///
323    /// See also [`Self::into_parts`] to recover the fields
324    pub fn try_new(
325        data_type: DataType,
326        len: usize,
327        null_bit_buffer: Option<Buffer>,
328        offset: usize,
329        buffers: Vec<Buffer>,
330        child_data: Vec<ArrayData>,
331    ) -> Result<Self, ArrowError> {
332        // we must check the length of `null_bit_buffer` first
333        // because we use this buffer to calculate `null_count`
334        // in `ArrayDataBuilder::build`.
335        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
336            let len_plus_offset = checked_len_plus_offset(&data_type, len, offset)?;
337            let needed_len = bit_util::ceil(len_plus_offset, 8);
338            if null_bit_buffer.len() < needed_len {
339                return Err(ArrowError::InvalidArgumentError(format!(
340                    "null_bit_buffer size too small. got {} needed {}",
341                    null_bit_buffer.len(),
342                    needed_len
343                )));
344            }
345        }
346
347        let builder = Self::inner_new_builder(
348            data_type,
349            len,
350            None,
351            null_bit_buffer,
352            offset,
353            buffers,
354            child_data,
355        );
356
357        assert!(!builder.skip_validation.get());
358
359        // As the data is not trusted, do a full validation of its contents
360        // We don't need to validate children as we can assume that the
361        // [`ArrayData`] in `child_data` have already been validated through
362        // a call to `ArrayData::try_new` or created using unsafe
363        builder.build()
364    }
365
366    fn inner_new_builder(
367        data_type: DataType,
368        len: usize,
369        null_count: Option<usize>,
370        null_bit_buffer: Option<Buffer>,
371        offset: usize,
372        buffers: Vec<Buffer>,
373        child_data: Vec<ArrayData>,
374    ) -> ArrayDataBuilder {
375        ArrayDataBuilder {
376            data_type,
377            len,
378            null_count,
379            null_bit_buffer,
380            nulls: None,
381            offset,
382            buffers,
383            child_data,
384            align_buffers: false,
385            skip_validation: UnsafeFlag::new(),
386        }
387    }
388
389    /// Return the constituent parts of this ArrayData
390    ///
391    /// This is the inverse of [`ArrayData::try_new`].
392    ///
393    /// Returns `(data_type, len, nulls, offset, buffers, child_data)`
394    pub fn into_parts(
395        self,
396    ) -> (
397        DataType,
398        usize,
399        Option<NullBuffer>,
400        usize,
401        Vec<Buffer>,
402        Vec<ArrayData>,
403    ) {
404        let Self {
405            data_type,
406            len,
407            nulls,
408            offset,
409            buffers,
410            child_data,
411        } = self;
412
413        (data_type, len, nulls, offset, buffers, child_data)
414    }
415
416    /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`]
417    #[inline]
418    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
419        ArrayDataBuilder::new(data_type)
420    }
421
422    /// Returns a reference to the [`DataType`] of this [`ArrayData`]
423    #[inline]
424    pub const fn data_type(&self) -> &DataType {
425        &self.data_type
426    }
427
428    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
429    pub fn buffers(&self) -> &[Buffer] {
430        &self.buffers
431    }
432
433    /// Returns a slice of children [`ArrayData`]. This will be non
434    /// empty for type such as lists and structs.
435    pub fn child_data(&self) -> &[ArrayData] {
436        &self.child_data[..]
437    }
438
439    /// Returns whether the element at index `i` is null
440    #[inline]
441    pub fn is_null(&self, i: usize) -> bool {
442        match &self.nulls {
443            Some(v) => v.is_null(i),
444            None => false,
445        }
446    }
447
448    /// Returns a reference to the null buffer of this [`ArrayData`] if any
449    ///
450    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
451    #[inline]
452    pub fn nulls(&self) -> Option<&NullBuffer> {
453        self.nulls.as_ref()
454    }
455
456    /// Returns whether the element at index `i` is not null
457    #[inline]
458    pub fn is_valid(&self, i: usize) -> bool {
459        !self.is_null(i)
460    }
461
462    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
463    #[inline]
464    pub const fn len(&self) -> usize {
465        self.len
466    }
467
468    /// Returns whether this [`ArrayData`] is empty
469    #[inline]
470    pub const fn is_empty(&self) -> bool {
471        self.len == 0
472    }
473
474    /// Returns the offset of this [`ArrayData`]
475    #[inline]
476    pub const fn offset(&self) -> usize {
477        self.offset
478    }
479
480    /// Returns the total number of nulls in this array
481    #[inline]
482    pub fn null_count(&self) -> usize {
483        self.nulls
484            .as_ref()
485            .map(|x| x.null_count())
486            .unwrap_or_default()
487    }
488
489    /// Returns the total number of bytes of memory occupied by the
490    /// buffers owned by this [`ArrayData`] and all of its
491    /// children. (See also diagram on [`ArrayData`]).
492    ///
493    /// Note that this [`ArrayData`] may only refer to a subset of the
494    /// data in the underlying [`Buffer`]s (due to `offset` and
495    /// `length`), but the size returned includes the entire size of
496    /// the buffers.
497    ///
498    /// If multiple [`ArrayData`]s refer to the same underlying
499    /// [`Buffer`]s they will both report the same size.
500    pub fn get_buffer_memory_size(&self) -> usize {
501        let mut size = 0;
502        for buffer in &self.buffers {
503            size += buffer.capacity();
504        }
505        if let Some(bitmap) = &self.nulls {
506            size += bitmap.buffer().capacity()
507        }
508        for child in &self.child_data {
509            size += child.get_buffer_memory_size();
510        }
511        size
512    }
513
514    /// Returns the total number of the bytes of memory occupied by
515    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
516    ///
517    /// This is approximately the number of bytes if a new
518    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
519    /// exactly the data needed.
520    ///
521    /// For example, a [`DataType::Int64`] with `100` elements,
522    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
523    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
524    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
525    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
526    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
527        let mut result: usize = 0;
528        let layout = layout(&self.data_type);
529
530        for spec in layout.buffers.iter() {
531            match spec {
532                BufferSpec::FixedWidth { byte_width, .. } => {
533                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
534                        ArrowError::ComputeError(
535                            "Integer overflow computing buffer size".to_string(),
536                        )
537                    })?;
538                    result += buffer_size;
539                }
540                BufferSpec::VariableWidth => {
541                    let buffer_len = match self.data_type {
542                        DataType::Utf8 | DataType::Binary => {
543                            let offsets = self.typed_offsets::<i32>()?;
544                            (offsets[self.len] - offsets[0]) as usize
545                        }
546                        DataType::LargeUtf8 | DataType::LargeBinary => {
547                            let offsets = self.typed_offsets::<i64>()?;
548                            (offsets[self.len] - offsets[0]) as usize
549                        }
550                        _ => {
551                            return Err(ArrowError::NotYetImplemented(format!(
552                                "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
553                                self.data_type
554                            )));
555                        }
556                    };
557                    result += buffer_len;
558                }
559                BufferSpec::BitMap => {
560                    let buffer_size = bit_util::ceil(self.len, 8);
561                    result += buffer_size;
562                }
563                BufferSpec::AlwaysNull => {
564                    // Nothing to do
565                }
566            }
567        }
568
569        if self.nulls().is_some() {
570            result += bit_util::ceil(self.len, 8);
571        }
572
573        for child in &self.child_data {
574            result += child.get_slice_memory_size()?;
575        }
576        Ok(result)
577    }
578
579    /// Returns the total number of bytes of memory occupied
580    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
581    /// children. (See also diagram on [`ArrayData`]).
582    ///
583    /// Equivalent to:
584    ///  `size_of_val(self)` +
585    ///  [`Self::get_buffer_memory_size`] +
586    ///  `size_of_val(child)` for all children
587    pub fn get_array_memory_size(&self) -> usize {
588        let mut size = mem::size_of_val(self);
589
590        // Calculate rest of the fields top down which contain actual data
591        for buffer in &self.buffers {
592            size += mem::size_of::<Buffer>();
593            size += buffer.capacity();
594        }
595        if let Some(nulls) = &self.nulls {
596            size += nulls.buffer().capacity();
597        }
598        for child in &self.child_data {
599            size += child.get_array_memory_size();
600        }
601
602        size
603    }
604
605    /// Creates a zero-copy slice of itself. This creates a new
606    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
607    /// different offset and len
608    ///
609    /// # Panics
610    ///
611    /// Panics if `offset + length` overflows or is greater than `self.len()`.
612    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
613        let end = offset
614            .checked_add(length)
615            .expect("offset + length overflow");
616        assert!(end <= self.len());
617
618        if let DataType::Struct(_) = self.data_type() {
619            // Slice into children
620            let new_offset = self.offset + offset;
621            ArrayData {
622                data_type: self.data_type().clone(),
623                len: length,
624                offset: new_offset,
625                buffers: self.buffers.clone(),
626                // Slice child data, to propagate offsets down to them
627                child_data: self
628                    .child_data()
629                    .iter()
630                    .map(|data| data.slice(offset, length))
631                    .collect(),
632                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
633            }
634        } else {
635            let mut new_data = self.clone();
636
637            new_data.len = length;
638            new_data.offset = offset + self.offset;
639            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
640
641            new_data
642        }
643    }
644
645    /// Returns the `buffer` as a slice of type `T` starting at self.offset
646    ///
647    /// # Panics
648    /// This function panics if:
649    /// * the buffer is not byte-aligned with type T, or
650    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
651    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
652        &self.buffers()[buffer].typed_data()[self.offset..]
653    }
654
655    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
656    ///
657    /// # Panics
658    /// This function panics if:
659    /// * the datatype `data_type` has incorrect layout
660    pub fn new_null(data_type: &DataType, len: usize) -> Self {
661        let bit_len = bit_util::ceil(len, 8);
662        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
663
664        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
665            Some(width) => (vec![zeroed(width * len)], vec![], true),
666            None => match data_type {
667                DataType::Null => (vec![], vec![], false),
668                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
669                DataType::Binary | DataType::Utf8 => {
670                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
671                }
672                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
673                DataType::LargeBinary | DataType::LargeUtf8 => {
674                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
675                }
676                DataType::FixedSizeBinary(i) => {
677                    if *i < 0 {
678                        panic!("cannot construct null data from FixedSizeBinary({i})");
679                    }
680                    (vec![zeroed(*i as usize * len)], vec![], true)
681                }
682                DataType::List(f) | DataType::Map(f, _) => (
683                    vec![zeroed((len + 1) * 4)],
684                    vec![ArrayData::new_empty(f.data_type())],
685                    true,
686                ),
687                DataType::LargeList(f) => (
688                    vec![zeroed((len + 1) * 8)],
689                    vec![ArrayData::new_empty(f.data_type())],
690                    true,
691                ),
692                DataType::ListView(f) => (
693                    vec![zeroed(len * 4), zeroed(len * 4)],
694                    vec![ArrayData::new_empty(f.data_type())],
695                    true,
696                ),
697                DataType::LargeListView(f) => (
698                    vec![zeroed(len * 8), zeroed(len * 8)],
699                    vec![ArrayData::new_empty(f.data_type())],
700                    true,
701                ),
702                DataType::FixedSizeList(f, list_len) => (
703                    vec![],
704                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
705                    true,
706                ),
707                DataType::Struct(fields) => (
708                    vec![],
709                    fields
710                        .iter()
711                        .map(|f| Self::new_null(f.data_type(), len))
712                        .collect(),
713                    true,
714                ),
715                DataType::Dictionary(k, v) => (
716                    vec![zeroed(k.primitive_width().unwrap() * len)],
717                    vec![ArrayData::new_empty(v.as_ref())],
718                    true,
719                ),
720                DataType::Union(f, mode) => {
721                    let (id, _) = f.iter().next().unwrap();
722                    let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
723                    let buffers = match mode {
724                        UnionMode::Sparse => vec![ids],
725                        UnionMode::Dense => {
726                            let end_offset = i32::from_usize(len).unwrap();
727                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
728                        }
729                    };
730
731                    let children = f
732                        .iter()
733                        .enumerate()
734                        .map(|(idx, (_, f))| {
735                            if idx == 0 || *mode == UnionMode::Sparse {
736                                Self::new_null(f.data_type(), len)
737                            } else {
738                                Self::new_empty(f.data_type())
739                            }
740                        })
741                        .collect();
742
743                    (buffers, children, false)
744                }
745                DataType::RunEndEncoded(r, v) => {
746                    if len == 0 {
747                        // For empty arrays, create zero-length child arrays.
748                        let runs = ArrayData::new_empty(r.data_type());
749                        let values = ArrayData::new_empty(v.data_type());
750                        (vec![], vec![runs, values], false)
751                    } else {
752                        let runs = match r.data_type() {
753                            DataType::Int16 => {
754                                let i = i16::from_usize(len).expect("run overflow");
755                                Buffer::from_slice_ref([i])
756                            }
757                            DataType::Int32 => {
758                                let i = i32::from_usize(len).expect("run overflow");
759                                Buffer::from_slice_ref([i])
760                            }
761                            DataType::Int64 => {
762                                let i = i64::from_usize(len).expect("run overflow");
763                                Buffer::from_slice_ref([i])
764                            }
765                            dt => unreachable!("Invalid run ends data type {dt}"),
766                        };
767
768                        let builder = ArrayData::builder(r.data_type().clone())
769                            .len(1)
770                            .buffers(vec![runs]);
771
772                        // SAFETY:
773                        // Valid by construction
774                        let runs = unsafe { builder.build_unchecked() };
775                        (
776                            vec![],
777                            vec![runs, ArrayData::new_null(v.data_type(), 1)],
778                            false,
779                        )
780                    }
781                }
782                // Handled by Some(width) branch above
783                DataType::Int8
784                | DataType::Int16
785                | DataType::Int32
786                | DataType::Int64
787                | DataType::UInt8
788                | DataType::UInt16
789                | DataType::UInt32
790                | DataType::UInt64
791                | DataType::Float16
792                | DataType::Float32
793                | DataType::Float64
794                | DataType::Timestamp(_, _)
795                | DataType::Date32
796                | DataType::Date64
797                | DataType::Time32(_)
798                | DataType::Time64(_)
799                | DataType::Duration(_)
800                | DataType::Interval(_)
801                | DataType::Decimal32(_, _)
802                | DataType::Decimal64(_, _)
803                | DataType::Decimal128(_, _)
804                | DataType::Decimal256(_, _) => unreachable!("{data_type}"),
805            },
806        };
807
808        let mut builder = ArrayDataBuilder::new(data_type.clone())
809            .len(len)
810            .buffers(buffers)
811            .child_data(child_data);
812
813        if has_nulls {
814            builder = builder.nulls(Some(NullBuffer::new_null(len)))
815        }
816
817        // SAFETY:
818        // Data valid by construction
819        unsafe { builder.build_unchecked() }
820    }
821
822    /// Returns a new empty [ArrayData] valid for `data_type`.
823    pub fn new_empty(data_type: &DataType) -> Self {
824        Self::new_null(data_type, 0)
825    }
826
827    /// Verifies that the buffers meet the minimum alignment requirements for the data type
828    ///
829    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
830    ///
831    /// This can be useful for when interacting with data sent over IPC or FFI, that may
832    /// not meet the minimum alignment requirements
833    ///
834    /// This also aligns buffers of children data
835    pub fn align_buffers(&mut self) {
836        let layout = layout(&self.data_type);
837        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
838            if let BufferSpec::FixedWidth { alignment, .. } = spec {
839                if buffer.as_ptr().align_offset(*alignment) != 0 {
840                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
841                }
842            }
843        }
844        // align children data recursively
845        for data in self.child_data.iter_mut() {
846            data.align_buffers()
847        }
848    }
849
850    /// "cheap" validation of an `ArrayData`. Ensures buffers are
851    /// sufficiently sized to store `len` + `offset` total elements of
852    /// `data_type` and performs other inexpensive consistency checks.
853    ///
854    /// This check is "cheap" in the sense that it does not validate the
855    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
856    /// are within the bounds of the values buffer).
857    ///
858    /// See [ArrayData::validate_data] to validate fully the offset content
859    /// and the validity of utf8 data
860    pub fn validate(&self) -> Result<(), ArrowError> {
861        // Need at least this much space in each buffer
862        let len_plus_offset = checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
863
864        // Check that the data layout conforms to the spec
865        let layout = layout(&self.data_type);
866
867        if !layout.can_contain_null_mask && self.nulls.is_some() {
868            return Err(ArrowError::InvalidArgumentError(format!(
869                "Arrays of type {:?} cannot contain a null bitmask",
870                self.data_type,
871            )));
872        }
873
874        // Check data buffers length for view types and other types
875        if self.buffers.len() < layout.buffers.len()
876            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
877        {
878            return Err(ArrowError::InvalidArgumentError(format!(
879                "Expected {} buffers in array of type {:?}, got {}",
880                layout.buffers.len(),
881                self.data_type,
882                self.buffers.len(),
883            )));
884        }
885
886        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
887            match spec {
888                BufferSpec::FixedWidth {
889                    byte_width,
890                    alignment,
891                } => {
892                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
893
894                    if buffer.len() < min_buffer_size {
895                        return Err(ArrowError::InvalidArgumentError(format!(
896                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
897                            min_buffer_size,
898                            i,
899                            self.data_type,
900                            buffer.len()
901                        )));
902                    }
903
904                    let align_offset = buffer.as_ptr().align_offset(*alignment);
905                    if align_offset != 0 {
906                        return Err(ArrowError::InvalidArgumentError(format!(
907                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
908                            self.data_type,
909                            align_offset.min(alignment - align_offset)
910                        )));
911                    }
912                }
913                BufferSpec::VariableWidth => {
914                    // not cheap to validate (need to look at the
915                    // data). Partially checked in validate_offsets
916                    // called below. Can check with `validate_full`
917                }
918                BufferSpec::BitMap => {
919                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
920                    if buffer.len() < min_buffer_size {
921                        return Err(ArrowError::InvalidArgumentError(format!(
922                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
923                            min_buffer_size,
924                            i,
925                            self.data_type,
926                            buffer.len()
927                        )));
928                    }
929                }
930                BufferSpec::AlwaysNull => {
931                    // Nothing to validate
932                }
933            }
934        }
935
936        // check null bit buffer size
937        if let Some(nulls) = self.nulls() {
938            if nulls.null_count() > self.len {
939                return Err(ArrowError::InvalidArgumentError(format!(
940                    "null_count {} for an array exceeds length of {} elements",
941                    nulls.null_count(),
942                    self.len
943                )));
944            }
945
946            let actual_len = nulls.validity().len();
947            let needed_len = bit_util::ceil(len_plus_offset, 8);
948            if actual_len < needed_len {
949                return Err(ArrowError::InvalidArgumentError(format!(
950                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
951                )));
952            }
953
954            if nulls.len() != self.len {
955                return Err(ArrowError::InvalidArgumentError(format!(
956                    "null buffer incorrect size. got {} expected {}",
957                    nulls.len(),
958                    self.len
959                )));
960            }
961        }
962
963        self.validate_child_data()?;
964
965        // Additional Type specific checks
966        match &self.data_type {
967            DataType::Utf8 | DataType::Binary => {
968                self.validate_offsets::<i32>(self.buffers[1].len())?;
969            }
970            DataType::LargeUtf8 | DataType::LargeBinary => {
971                self.validate_offsets::<i64>(self.buffers[1].len())?;
972            }
973            DataType::Dictionary(key_type, _value_type) => {
974                // At the moment, constructing a DictionaryArray will also check this
975                if !DataType::is_dictionary_key_type(key_type) {
976                    return Err(ArrowError::InvalidArgumentError(format!(
977                        "Dictionary key type must be integer, but was {key_type}"
978                    )));
979                }
980            }
981            DataType::RunEndEncoded(run_ends_type, _) => {
982                if run_ends_type.is_nullable() {
983                    return Err(ArrowError::InvalidArgumentError(
984                        "The nullable should be set to false for the field defining run_ends array.".to_string()
985                    ));
986                }
987                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
988                    return Err(ArrowError::InvalidArgumentError(format!(
989                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
990                        run_ends_type.data_type()
991                    )));
992                }
993            }
994            _ => {}
995        };
996
997        Ok(())
998    }
999
1000    /// Returns a reference to the data in `buffer` as a typed slice
1001    /// (typically `&[i32]` or `&[i64]`) after validating. The
1002    /// returned slice is guaranteed to have at least `self.len + 1`
1003    /// entries.
1004    ///
1005    /// For an empty array, the `buffer` can also be empty.
1006    fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
1007        // An empty list-like array can have 0 offsets
1008        if self.len == 0 && self.buffers[0].is_empty() {
1009            return Ok(&[]);
1010        }
1011
1012        let len = checked_len_plus_offset(&self.data_type, self.len, 1)?;
1013
1014        self.typed_buffer(0, len)
1015    }
1016
1017    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
1018    fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
1019        &self,
1020        idx: usize,
1021        len: usize,
1022    ) -> Result<&[T], ArrowError> {
1023        let buffer = &self.buffers[idx];
1024
1025        let required_elements = checked_len_plus_offset(&self.data_type, len, self.offset)?;
1026        let byte_width = mem::size_of::<T>();
1027        let required_len = required_elements.checked_mul(byte_width).ok_or_else(|| {
1028            ArrowError::InvalidArgumentError(format!(
1029                "Buffer {idx} of {} byte length overflow: {} elements of {} bytes exceeds usize",
1030                self.data_type, required_elements, byte_width
1031            ))
1032        })?;
1033
1034        if buffer.len() < required_len {
1035            return Err(ArrowError::InvalidArgumentError(format!(
1036                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
1037                idx,
1038                self.data_type,
1039                required_len,
1040                buffer.len()
1041            )));
1042        }
1043
1044        Ok(&buffer.typed_data::<T>()[self.offset..required_elements])
1045    }
1046
1047    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
1048    /// offsets (of type T) into some other buffer of `values_length` bytes long
1049    fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
1050        &self,
1051        values_length: usize,
1052    ) -> Result<(), ArrowError> {
1053        // Justification: buffer size was validated above
1054        let offsets = self.typed_offsets::<T>()?;
1055        if offsets.is_empty() {
1056            return Ok(());
1057        }
1058
1059        let first_offset = offsets[0].to_usize().ok_or_else(|| {
1060            ArrowError::InvalidArgumentError(format!(
1061                "Error converting offset[0] ({}) to usize for {}",
1062                offsets[0], self.data_type
1063            ))
1064        })?;
1065
1066        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
1067            ArrowError::InvalidArgumentError(format!(
1068                "Error converting offset[{}] ({}) to usize for {}",
1069                self.len, offsets[self.len], self.data_type
1070            ))
1071        })?;
1072
1073        if first_offset > values_length {
1074            return Err(ArrowError::InvalidArgumentError(format!(
1075                "First offset {} of {} is larger than values length {}",
1076                first_offset, self.data_type, values_length,
1077            )));
1078        }
1079
1080        if last_offset > values_length {
1081            return Err(ArrowError::InvalidArgumentError(format!(
1082                "Last offset {} of {} is larger than values length {}",
1083                last_offset, self.data_type, values_length,
1084            )));
1085        }
1086
1087        if first_offset > last_offset {
1088            return Err(ArrowError::InvalidArgumentError(format!(
1089                "First offset {} in {} is smaller than last offset {}",
1090                first_offset, self.data_type, last_offset,
1091            )));
1092        }
1093
1094        Ok(())
1095    }
1096
1097    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
1098    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
1099    fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
1100        &self,
1101        values_length: usize,
1102    ) -> Result<(), ArrowError> {
1103        let offsets: &[T] = self.typed_buffer(0, self.len)?;
1104        let sizes: &[T] = self.typed_buffer(1, self.len)?;
1105        if offsets.len() != sizes.len() {
1106            return Err(ArrowError::ComputeError(format!(
1107                "ListView offsets len {} does not match sizes len {}",
1108                offsets.len(),
1109                sizes.len()
1110            )));
1111        }
1112
1113        for i in 0..sizes.len() {
1114            let size = sizes[i].to_usize().ok_or_else(|| {
1115                ArrowError::InvalidArgumentError(format!(
1116                    "Error converting size[{}] ({}) to usize for {}",
1117                    i, sizes[i], self.data_type
1118                ))
1119            })?;
1120            let offset = offsets[i].to_usize().ok_or_else(|| {
1121                ArrowError::InvalidArgumentError(format!(
1122                    "Error converting offset[{}] ({}) to usize for {}",
1123                    i, offsets[i], self.data_type
1124                ))
1125            })?;
1126            if size
1127                .checked_add(offset)
1128                .expect("Offset and size have exceeded the usize boundary")
1129                > values_length
1130            {
1131                return Err(ArrowError::InvalidArgumentError(format!(
1132                    "Size {} at index {} is larger than the remaining values for {}",
1133                    size, i, self.data_type
1134                )));
1135            }
1136        }
1137        Ok(())
1138    }
1139
1140    /// Validates the layout of `child_data` ArrayData structures
1141    fn validate_child_data(&self) -> Result<(), ArrowError> {
1142        match &self.data_type {
1143            DataType::List(field) | DataType::Map(field, _) => {
1144                let values_data = self.get_single_valid_child_data(field.data_type())?;
1145                self.validate_offsets::<i32>(values_data.len)?;
1146                Ok(())
1147            }
1148            DataType::LargeList(field) => {
1149                let values_data = self.get_single_valid_child_data(field.data_type())?;
1150                self.validate_offsets::<i64>(values_data.len)?;
1151                Ok(())
1152            }
1153            DataType::ListView(field) => {
1154                let values_data = self.get_single_valid_child_data(field.data_type())?;
1155                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1156                Ok(())
1157            }
1158            DataType::LargeListView(field) => {
1159                let values_data = self.get_single_valid_child_data(field.data_type())?;
1160                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1161                Ok(())
1162            }
1163            DataType::FixedSizeList(field, list_size) => {
1164                let values_data = self.get_single_valid_child_data(field.data_type())?;
1165
1166                let list_size: usize = (*list_size).try_into().map_err(|_| {
1167                    ArrowError::InvalidArgumentError(format!(
1168                        "{} has a negative list_size {}",
1169                        self.data_type, list_size
1170                    ))
1171                })?;
1172
1173                let expected_values_len = self.len
1174                    .checked_mul(list_size)
1175                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1176
1177                if values_data.len < expected_values_len {
1178                    return Err(ArrowError::InvalidArgumentError(format!(
1179                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1180                        values_data.len, self.len, list_size, self.data_type
1181                    )));
1182                }
1183
1184                Ok(())
1185            }
1186            DataType::Struct(fields) => {
1187                self.validate_num_child_data(fields.len())?;
1188                for (i, field) in fields.iter().enumerate() {
1189                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1190
1191                    // Ensure child field has sufficient size
1192                    if field_data.len < self.len {
1193                        return Err(ArrowError::InvalidArgumentError(format!(
1194                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1195                            self.data_type,
1196                            i,
1197                            field.name(),
1198                            field_data.len,
1199                            self.len
1200                        )));
1201                    }
1202                }
1203                Ok(())
1204            }
1205            DataType::RunEndEncoded(run_ends_field, values_field) => {
1206                self.validate_num_child_data(2)?;
1207                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1208                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1209                if run_ends_data.len != values_data.len {
1210                    return Err(ArrowError::InvalidArgumentError(format!(
1211                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1212                        run_ends_data.len, values_data.len
1213                    )));
1214                }
1215                if run_ends_data.nulls.is_some() {
1216                    return Err(ArrowError::InvalidArgumentError(
1217                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1218                    ));
1219                }
1220                Ok(())
1221            }
1222            DataType::Union(fields, mode) => {
1223                self.validate_num_child_data(fields.len())?;
1224
1225                for (i, (_, field)) in fields.iter().enumerate() {
1226                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1227
1228                    if mode == &UnionMode::Sparse {
1229                        let len_plus_offset =
1230                            checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
1231                        if field_data.len < len_plus_offset {
1232                            return Err(ArrowError::InvalidArgumentError(format!(
1233                                "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1234                                i, field_data.len, len_plus_offset
1235                            )));
1236                        }
1237                    }
1238                }
1239                Ok(())
1240            }
1241            DataType::Dictionary(_key_type, value_type) => {
1242                self.get_single_valid_child_data(value_type)?;
1243                Ok(())
1244            }
1245            _ => {
1246                // other types do not have child data
1247                if !self.child_data.is_empty() {
1248                    return Err(ArrowError::InvalidArgumentError(format!(
1249                        "Expected no child arrays for type {} but got {}",
1250                        self.data_type,
1251                        self.child_data.len()
1252                    )));
1253                }
1254                Ok(())
1255            }
1256        }
1257    }
1258
1259    /// Ensures that this array data has a single child_data with the
1260    /// expected type, and calls `validate()` on it. Returns a
1261    /// reference to that child_data
1262    fn get_single_valid_child_data(
1263        &self,
1264        expected_type: &DataType,
1265    ) -> Result<&ArrayData, ArrowError> {
1266        self.validate_num_child_data(1)?;
1267        self.get_valid_child_data(0, expected_type)
1268    }
1269
1270    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1271    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1272        if self.child_data.len() != expected_len {
1273            Err(ArrowError::InvalidArgumentError(format!(
1274                "Value data for {} should contain {} child data array(s), had {}",
1275                self.data_type,
1276                expected_len,
1277                self.child_data.len()
1278            )))
1279        } else {
1280            Ok(())
1281        }
1282    }
1283
1284    /// Ensures that `child_data[i]` has the expected type, calls
1285    /// `validate()` on it, and returns a reference to that child_data
1286    fn get_valid_child_data(
1287        &self,
1288        i: usize,
1289        expected_type: &DataType,
1290    ) -> Result<&ArrayData, ArrowError> {
1291        let values_data = self.child_data.get(i).ok_or_else(|| {
1292            ArrowError::InvalidArgumentError(format!(
1293                "{} did not have enough child arrays. Expected at least {} but had only {}",
1294                self.data_type,
1295                i + 1,
1296                self.child_data.len()
1297            ))
1298        })?;
1299
1300        if expected_type != &values_data.data_type {
1301            return Err(ArrowError::InvalidArgumentError(format!(
1302                "Child type mismatch for {}. Expected {} but child data had {}",
1303                self.data_type, expected_type, values_data.data_type
1304            )));
1305        }
1306
1307        values_data.validate()?;
1308        Ok(values_data)
1309    }
1310
1311    /// Validate that the data contained within this [`ArrayData`] is valid
1312    ///
1313    /// 1. Null count is correct
1314    /// 2. All offsets are valid
1315    /// 3. All String data is valid UTF-8
1316    /// 4. All dictionary offsets are valid
1317    ///
1318    /// Internally this calls:
1319    ///
1320    /// * [`Self::validate`]
1321    /// * [`Self::validate_nulls`]
1322    /// * [`Self::validate_values`]
1323    ///
1324    /// Note: this does not recurse into children, for a recursive variant
1325    /// see [`Self::validate_full`]
1326    pub fn validate_data(&self) -> Result<(), ArrowError> {
1327        self.validate()?;
1328
1329        self.validate_nulls()?;
1330        self.validate_values()?;
1331        Ok(())
1332    }
1333
1334    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1335    ///
1336    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1337    /// and all its children recursively
1338    pub fn validate_full(&self) -> Result<(), ArrowError> {
1339        self.validate_data()?;
1340        // validate all children recursively
1341        self.child_data
1342            .iter()
1343            .enumerate()
1344            .try_for_each(|(i, child_data)| {
1345                child_data.validate_full().map_err(|e| {
1346                    ArrowError::InvalidArgumentError(format!(
1347                        "{} child #{} invalid: {}",
1348                        self.data_type, i, e
1349                    ))
1350                })
1351            })?;
1352        Ok(())
1353    }
1354
1355    /// Validates the values stored within this [`ArrayData`] are valid
1356    /// without recursing into child [`ArrayData`]
1357    ///
1358    /// Does not (yet) check
1359    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1360    /// 2. the the null count is correct and that any
1361    /// 3. nullability requirements of its children are correct
1362    ///
1363    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1364    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1365        if let Some(nulls) = &self.nulls {
1366            let actual = nulls.len() - nulls.inner().count_set_bits();
1367            if actual != nulls.null_count() {
1368                return Err(ArrowError::InvalidArgumentError(format!(
1369                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1370                    nulls.null_count(),
1371                    actual
1372                )));
1373            }
1374        }
1375
1376        // In general non-nullable children should not contain nulls, however, for certain
1377        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1378        // space in the child. As such we permit nulls in the children in the corresponding
1379        // positions for such types
1380        match &self.data_type {
1381            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1382                if !f.is_nullable() {
1383                    self.validate_non_nullable(None, &self.child_data[0])?
1384                }
1385            }
1386            DataType::FixedSizeList(field, len) => {
1387                let child = &self.child_data[0];
1388                if !field.is_nullable() {
1389                    match &self.nulls {
1390                        Some(nulls) => {
1391                            let element_len = *len as usize;
1392                            let expanded = nulls.expand(element_len);
1393                            self.validate_non_nullable(Some(&expanded), child)?;
1394                        }
1395                        None => self.validate_non_nullable(None, child)?,
1396                    }
1397                }
1398            }
1399            DataType::Struct(fields) => {
1400                for (field, child) in fields.iter().zip(&self.child_data) {
1401                    if !field.is_nullable() {
1402                        self.validate_non_nullable(self.nulls(), child)?
1403                    }
1404                }
1405            }
1406            _ => {}
1407        }
1408
1409        Ok(())
1410    }
1411
1412    /// Verifies that `child` contains no nulls not present in `mask`
1413    fn validate_non_nullable(
1414        &self,
1415        mask: Option<&NullBuffer>,
1416        child: &ArrayData,
1417    ) -> Result<(), ArrowError> {
1418        let mask = match mask {
1419            Some(mask) => mask,
1420            None => {
1421                return match child.null_count() {
1422                    0 => Ok(()),
1423                    _ => Err(ArrowError::InvalidArgumentError(format!(
1424                        "non-nullable child of type {} contains nulls not present in parent {}",
1425                        child.data_type, self.data_type
1426                    ))),
1427                };
1428            }
1429        };
1430
1431        match child.nulls() {
1432            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1433                "non-nullable child of type {} contains nulls not present in parent",
1434                child.data_type
1435            ))),
1436            _ => Ok(()),
1437        }
1438    }
1439
1440    /// Validates the values stored within this [`ArrayData`] are valid
1441    /// without recursing into child [`ArrayData`]
1442    ///
1443    /// Does not (yet) check
1444    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1445    pub fn validate_values(&self) -> Result<(), ArrowError> {
1446        match &self.data_type {
1447            DataType::Utf8 => self.validate_utf8::<i32>(),
1448            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1449            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1450            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1451            DataType::BinaryView => {
1452                let views = self.typed_buffer::<u128>(0, self.len)?;
1453                validate_binary_view(views, &self.buffers[1..])
1454            }
1455            DataType::Utf8View => {
1456                let views = self.typed_buffer::<u128>(0, self.len)?;
1457                validate_string_view(views, &self.buffers[1..])
1458            }
1459            DataType::List(_) | DataType::Map(_, _) => {
1460                let child = &self.child_data[0];
1461                self.validate_offsets_full::<i32>(child.len)
1462            }
1463            DataType::LargeList(_) => {
1464                let child = &self.child_data[0];
1465                self.validate_offsets_full::<i64>(child.len)
1466            }
1467            DataType::Union(_, _) => {
1468                // Validate Union Array as part of implementing new Union semantics
1469                // See comments in `ArrayData::validate()`
1470                // https://github.com/apache/arrow-rs/issues/85
1471                //
1472                // TODO file follow on ticket for full union validation
1473                Ok(())
1474            }
1475            DataType::Dictionary(key_type, _value_type) => {
1476                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1477                let max_value = dictionary_length - 1;
1478                match key_type.as_ref() {
1479                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1480                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1481                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1482                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1483                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1484                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1485                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1486                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1487                    _ => unreachable!(),
1488                }
1489            }
1490            DataType::RunEndEncoded(run_ends, _values) => {
1491                let run_ends_data = self.child_data()[0].clone();
1492                match run_ends.data_type() {
1493                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1494                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1495                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1496                    _ => unreachable!(),
1497                }
1498            }
1499            _ => {
1500                // No extra validation check required for other types
1501                Ok(())
1502            }
1503        }
1504    }
1505
1506    /// Calls the `validate(item_index, range)` function for each of
1507    /// the ranges specified in the arrow offsets buffer of type
1508    /// `T`. Also validates that each offset is smaller than
1509    /// `offset_limit`
1510    ///
1511    /// For an empty array, the offsets buffer can either be empty
1512    /// or contain a single `0`.
1513    ///
1514    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1515    /// function would call `validate([1,2])`, and `validate([2,4])`
1516    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1517    where
1518        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1519        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1520    {
1521        self.typed_offsets::<T>()?
1522            .iter()
1523            .enumerate()
1524            .map(|(i, x)| {
1525                // check if the offset can be converted to usize
1526                let r = x.to_usize().ok_or_else(|| {
1527                    ArrowError::InvalidArgumentError(format!(
1528                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1529                    );
1530                // check if the offset exceeds the limit
1531                match r {
1532                    Ok(n) if n <= offset_limit => Ok((i, n)),
1533                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1534                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1535                    ),
1536                    Err(e) => Err(e),
1537                }
1538            })
1539            .scan(0_usize, |start, end| {
1540                // check offsets are monotonically increasing
1541                match end {
1542                    Ok((i, end)) if *start <= end => {
1543                        let range = Some(Ok((i, *start..end)));
1544                        *start = end;
1545                        range
1546                    }
1547                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1548                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1549                        i - 1, start, end))
1550                    )),
1551                    Err(err) => Some(Err(err)),
1552                }
1553            })
1554            .skip(1) // the first element is meaningless
1555            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1556                let (item_index, range) = res?;
1557                validate(item_index-1, range)
1558            })
1559    }
1560
1561    /// Ensures that all strings formed by the offsets in `buffers[0]`
1562    /// into `buffers[1]` are valid utf8 sequences
1563    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1564    where
1565        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1566    {
1567        let values_buffer = &self.buffers[1].as_slice();
1568        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1569            // Validate Offsets are correct
1570            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1571                if !values_str.is_char_boundary(range.start)
1572                    || !values_str.is_char_boundary(range.end)
1573                {
1574                    return Err(ArrowError::InvalidArgumentError(format!(
1575                        "incomplete utf-8 byte sequence from index {string_index}"
1576                    )));
1577                }
1578                Ok(())
1579            })
1580        } else {
1581            // find specific offset that failed utf8 validation
1582            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1583                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1584                    ArrowError::InvalidArgumentError(format!(
1585                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1586                    ))
1587                })?;
1588                Ok(())
1589            })
1590        }
1591    }
1592
1593    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1594    /// between `0` and `offset_limit`
1595    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1596    where
1597        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1598    {
1599        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1600            // No validation applied to each value, but the iteration
1601            // itself applies bounds checking to each range
1602            Ok(())
1603        })
1604    }
1605
1606    /// Validates that each value in self.buffers (typed as T)
1607    /// is within the range [0, max_value], inclusive
1608    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1609    where
1610        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1611    {
1612        let required_len = checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
1613        let buffer = &self.buffers[0];
1614
1615        // This should have been checked as part of `validate()` prior
1616        // to calling `validate_full()` but double check to be sure
1617        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1618
1619        // Justification: buffer size was validated above
1620        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..required_len];
1621
1622        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1623            // Do not check the value is null (value can be arbitrary)
1624            if self.is_null(i) {
1625                return Ok(());
1626            }
1627            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1628                ArrowError::InvalidArgumentError(format!(
1629                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1630                ))
1631            })?;
1632
1633            if dict_index < 0 || dict_index > max_value {
1634                return Err(ArrowError::InvalidArgumentError(format!(
1635                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1636                )));
1637            }
1638            Ok(())
1639        })
1640    }
1641
1642    /// Validates that each value in run_ends array is positive and strictly increasing.
1643    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1644    where
1645        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1646    {
1647        let values = self.typed_buffer::<T>(0, self.len)?;
1648        let mut prev_value: i64 = 0_i64;
1649        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1650            let value: i64 = inp_value.try_into().map_err(|_| {
1651                ArrowError::InvalidArgumentError(format!(
1652                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1653                ))
1654            })?;
1655            if value <= 0_i64 {
1656                return Err(ArrowError::InvalidArgumentError(format!(
1657                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1658                )));
1659            }
1660            if ix > 0 && value <= prev_value {
1661                return Err(ArrowError::InvalidArgumentError(format!(
1662                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1663                )));
1664            }
1665
1666            prev_value = value;
1667            Ok(())
1668        })?;
1669
1670        let len_plus_offset = checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
1671        if prev_value.as_usize() < len_plus_offset {
1672            return Err(ArrowError::InvalidArgumentError(format!(
1673                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1674                len_plus_offset
1675            )));
1676        }
1677        Ok(())
1678    }
1679
1680    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1681    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1682    /// return false when the arrays are logically equal
1683    pub fn ptr_eq(&self, other: &Self) -> bool {
1684        if self.offset != other.offset
1685            || self.len != other.len
1686            || self.data_type != other.data_type
1687            || self.buffers.len() != other.buffers.len()
1688            || self.child_data.len() != other.child_data.len()
1689        {
1690            return false;
1691        }
1692
1693        match (&self.nulls, &other.nulls) {
1694            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1695            (Some(_), None) | (None, Some(_)) => return false,
1696            _ => {}
1697        };
1698
1699        if !self
1700            .buffers
1701            .iter()
1702            .zip(other.buffers.iter())
1703            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1704        {
1705            return false;
1706        }
1707
1708        self.child_data
1709            .iter()
1710            .zip(other.child_data.iter())
1711            .all(|(a, b)| a.ptr_eq(b))
1712    }
1713
1714    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1715    pub fn into_builder(self) -> ArrayDataBuilder {
1716        self.into()
1717    }
1718
1719    /// Claim memory used by this ArrayData in the provided memory pool.
1720    ///
1721    /// This claims memory for:
1722    /// - All buffers in self.buffers
1723    /// - All child ArrayData recursively
1724    /// - The null buffer if present
1725    #[cfg(feature = "pool")]
1726    pub fn claim(&self, pool: &dyn arrow_buffer::MemoryPool) {
1727        // Claim all data buffers
1728        for buffer in &self.buffers {
1729            buffer.claim(pool);
1730        }
1731
1732        // Claim null buffer if present
1733        if let Some(nulls) = &self.nulls {
1734            nulls.claim(pool);
1735        }
1736
1737        // Recursively claim child data
1738        for child in &self.child_data {
1739            child.claim(pool);
1740        }
1741    }
1742}
1743
1744/// Return the expected [`DataTypeLayout`] Arrays of this data
1745/// type are expected to have
1746pub fn layout(data_type: &DataType) -> DataTypeLayout {
1747    // based on C/C++ implementation in
1748    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1749    use arrow_schema::IntervalUnit::*;
1750
1751    match data_type {
1752        DataType::Null => DataTypeLayout {
1753            buffers: vec![],
1754            can_contain_null_mask: false,
1755            variadic: false,
1756        },
1757        DataType::Boolean => DataTypeLayout {
1758            buffers: vec![BufferSpec::BitMap],
1759            can_contain_null_mask: true,
1760            variadic: false,
1761        },
1762        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1763        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1764        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1765        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1766        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1767        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1768        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1769        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1770        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1771        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1772        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1773        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1774        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1775        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1776        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1777        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1778        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1779        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1780        DataType::Interval(MonthDayNano) => {
1781            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1782        }
1783        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1784        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1785        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1786        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1787        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1788        DataType::FixedSizeBinary(size) => {
1789            let spec = BufferSpec::FixedWidth {
1790                byte_width: (*size).try_into().unwrap(),
1791                alignment: mem::align_of::<u8>(),
1792            };
1793            DataTypeLayout {
1794                buffers: vec![spec],
1795                can_contain_null_mask: true,
1796                variadic: false,
1797            }
1798        }
1799        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1800        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1801        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1802        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1803        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1804        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1805        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1806        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1807        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1808        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1809        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1810        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1811        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1812        DataType::Union(_, mode) => {
1813            let type_ids = BufferSpec::FixedWidth {
1814                byte_width: mem::size_of::<i8>(),
1815                alignment: mem::align_of::<i8>(),
1816            };
1817
1818            DataTypeLayout {
1819                buffers: match mode {
1820                    UnionMode::Sparse => {
1821                        vec![type_ids]
1822                    }
1823                    UnionMode::Dense => {
1824                        vec![
1825                            type_ids,
1826                            BufferSpec::FixedWidth {
1827                                byte_width: mem::size_of::<i32>(),
1828                                alignment: mem::align_of::<i32>(),
1829                            },
1830                        ]
1831                    }
1832                },
1833                can_contain_null_mask: false,
1834                variadic: false,
1835            }
1836        }
1837        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1838    }
1839}
1840
/// Layout specification for a data type
///
/// Describes which buffers an array of a given data type is expected to
/// carry, and whether it may have a null bitmask.
#[derive(Debug, PartialEq, Eq)]
// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
pub struct DataTypeLayout {
    /// A vector of buffer layout specifications, one for each expected buffer
    pub buffers: Vec<BufferSpec>,

    /// Whether arrays of this type can contain a null bitmask
    pub can_contain_null_mask: bool,

    /// This field only applies to the view types [`DataType::BinaryView`] and [`DataType::Utf8View`].
    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
    /// `buffers.len()`. Buffers that exceed the lower bound are legal.
    pub variadic: bool,
}
1856
1857impl DataTypeLayout {
1858    /// Describes a basic numeric array where each element has type `T`
1859    pub fn new_fixed_width<T>() -> Self {
1860        Self {
1861            buffers: vec![BufferSpec::FixedWidth {
1862                byte_width: mem::size_of::<T>(),
1863                alignment: mem::align_of::<T>(),
1864            }],
1865            can_contain_null_mask: true,
1866            variadic: false,
1867        }
1868    }
1869
1870    /// Describes arrays which have no data of their own
1871    /// but may still have a Null Bitmap (e.g. FixedSizeList)
1872    pub fn new_nullable_empty() -> Self {
1873        Self {
1874            buffers: vec![],
1875            can_contain_null_mask: true,
1876            variadic: false,
1877        }
1878    }
1879
1880    /// Describes arrays which have no data of their own
1881    /// (e.g. RunEndEncoded).
1882    pub fn new_empty() -> Self {
1883        Self {
1884            buffers: vec![],
1885            can_contain_null_mask: false,
1886            variadic: false,
1887        }
1888    }
1889
1890    /// Describes a basic numeric array where each element has a fixed
1891    /// with offset buffer of type `T`, followed by a
1892    /// variable width data buffer
1893    pub fn new_binary<T>() -> Self {
1894        Self {
1895            buffers: vec![
1896                // offsets
1897                BufferSpec::FixedWidth {
1898                    byte_width: mem::size_of::<T>(),
1899                    alignment: mem::align_of::<T>(),
1900                },
1901                // values
1902                BufferSpec::VariableWidth,
1903            ],
1904            can_contain_null_mask: true,
1905            variadic: false,
1906        }
1907    }
1908
1909    /// Describes a view type
1910    pub fn new_view() -> Self {
1911        Self {
1912            buffers: vec![BufferSpec::FixedWidth {
1913                byte_width: mem::size_of::<u128>(),
1914                alignment: mem::align_of::<u128>(),
1915            }],
1916            can_contain_null_mask: true,
1917            variadic: true,
1918        }
1919    }
1920
1921    /// Describes a list view type
1922    pub fn new_list_view<T>() -> Self {
1923        Self {
1924            buffers: vec![
1925                BufferSpec::FixedWidth {
1926                    byte_width: mem::size_of::<T>(),
1927                    alignment: mem::align_of::<T>(),
1928                },
1929                BufferSpec::FixedWidth {
1930                    byte_width: mem::size_of::<T>(),
1931                    alignment: mem::align_of::<T>(),
1932                },
1933            ],
1934            can_contain_null_mask: true,
1935            variadic: false,
1936        }
1937    }
1938}
1939
/// Layout specification for a single data type buffer
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
    ///
    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
    ///
    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
    /// [slice](std::slice) based APIs. Alignment in excess of this is not required, to allow
    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
    ///
    /// Note that these alignment requirements will vary between architectures
    FixedWidth {
        /// The width of each element in bytes
        byte_width: usize,
        /// The alignment required by Rust for an array of the corresponding primitive
        alignment: usize,
    },
    /// Variable width, such as string data for utf8 data
    VariableWidth,
    /// Buffer holds a bitmap.
    ///
    /// Note: Unlike the C++ implementation, the null/validity buffer
    /// is handled specially rather than as another of the buffers in
    /// the spec, so this variant is only used for the Boolean type.
    BitMap,
    /// Buffer is always null. Currently unused in the Rust implementation
    /// (used in C++ for the Union type)
    #[allow(dead_code)]
    AlwaysNull,
}
1972
/// Equality for [`ArrayData`] is delegated to `equal::equal`.
///
/// See `ArrayData::ptr_eq` for a cheaper, pointer based comparison.
impl PartialEq for ArrayData {
    fn eq(&self, other: &Self) -> bool {
        equal::equal(self, other)
    }
}
1978
/// A boolean flag that cannot be mutated outside of unsafe code.
///
/// Defaults to a value of false.
///
/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
///
/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
///
/// # Example
/// ```rust
/// use arrow_data::UnsafeFlag;
/// assert!(!UnsafeFlag::default().get()); // default is false
/// let mut flag = UnsafeFlag::new();
/// assert!(!flag.get()); // defaults to false
/// // can only set it to true in unsafe code
/// unsafe { flag.set(true) };
/// assert!(flag.get()); // now true
/// ```
#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a new `UnsafeFlag` with the value set to `false`.
    ///
    /// See the example on [`UnsafeFlag`]
    #[inline]
    pub const fn new() -> Self {
        Self(false)
    }

    /// Sets the value of the flag to the given value
    ///
    /// Note this can purposely only be done in `unsafe` code
    ///
    /// # Safety
    ///
    /// There is nothing immediately unsafe about setting the flag. However,
    /// the flag can be used to subsequently bypass safety checks in the
    /// [`ArrayDataBuilder`], so the caller takes on responsibility for
    /// whatever those checks would have guaranteed.
    ///
    /// [`ArrayDataBuilder`]: super::ArrayDataBuilder
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        self.0 = val;
    }

    /// Returns the value of the flag
    ///
    /// This is a `const fn`, like [`Self::new`], so the flag can also be
    /// read in const contexts.
    #[inline]
    pub const fn get(&self) -> bool {
        self.0
    }
}

// Manual impl to make it clear you can not construct unsafe with true
impl Default for UnsafeFlag {
    /// Returns a flag set to `false`, same as [`UnsafeFlag::new`]
    fn default() -> Self {
        Self::new()
    }
}
2037
/// Builder for [`ArrayData`] type
///
/// Collects the parts of an [`ArrayData`] and assembles (and, unless
/// skipped, validates) them in [`ArrayDataBuilder::build`].
#[derive(Debug)]
pub struct ArrayDataBuilder {
    /// The data type of the array being built
    data_type: DataType,
    /// The length of the array being built
    len: usize,
    /// Optional null count, used together with `null_bit_buffer` when
    /// building the null buffer
    null_count: Option<usize>,
    /// Optional raw validity bitmap; only consulted by `build` when
    /// `nulls` is not set
    null_bit_buffer: Option<Buffer>,
    /// Optional null buffer; takes precedence over `null_bit_buffer`
    nulls: Option<NullBuffer>,
    /// The offset of the array being built
    offset: usize,
    /// The buffers of the array being built
    buffers: Vec<Buffer>,
    /// The child data of the array being built
    child_data: Vec<ArrayData>,
    /// Should buffers be realigned (copying if necessary)?
    ///
    /// Defaults to false.
    align_buffers: bool,
    /// Should data validation be skipped for this [`ArrayData`]?
    ///
    /// Defaults to false.
    ///
    /// # Safety
    ///
    /// This flag can only be set to true using `unsafe` APIs. However, once true
    /// subsequent calls to `build()` may result in undefined behavior if the data
    /// is not valid.
    skip_validation: UnsafeFlag,
}
2064
impl ArrayDataBuilder {
    #[inline]
    /// Creates a new array data builder for the given `data_type`
    ///
    /// All other settings start at their defaults: a length and offset of
    /// `0`, no buffers, no child data, no null information, no buffer
    /// realignment, and validation enabled.
    pub const fn new(data_type: DataType) -> Self {
        Self {
            data_type,
            len: 0,
            null_count: None,
            null_bit_buffer: None,
            nulls: None,
            offset: 0,
            buffers: vec![],
            child_data: vec![],
            align_buffers: false,
            skip_validation: UnsafeFlag::new(),
        }
    }

    /// Creates a new array data builder from an existing one, changing the data type
    ///
    /// All other settings are carried over unchanged.
    pub fn data_type(self, data_type: DataType) -> Self {
        Self { data_type, ..self }
    }

    #[inline]
    #[allow(clippy::len_without_is_empty)]
    /// Sets the length of the [ArrayData]
    pub const fn len(mut self, n: usize) -> Self {
        self.len = n;
        self
    }

    /// Sets the null buffer of the [ArrayData]
    ///
    /// This clears any null count and null bit buffer set previously.
    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
        self.nulls = nulls;
        self.null_count = None;
        self.null_bit_buffer = None;
        self
    }

    /// Sets the null count of the [ArrayData]
    pub fn null_count(mut self, null_count: usize) -> Self {
        self.null_count = Some(null_count);
        self
    }

    /// Sets the `null_bit_buffer` of the [ArrayData]
    ///
    /// This clears any null buffer set previously via [`Self::nulls`]; a
    /// null count set previously is kept.
    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
        self.nulls = None;
        self.null_bit_buffer = buf;
        self
    }

    /// Sets the offset of the [ArrayData]
    #[inline]
    pub const fn offset(mut self, n: usize) -> Self {
        self.offset = n;
        self
    }

    /// Sets the buffers of the [ArrayData], replacing any added previously
    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
        self.buffers = v;
        self
    }

    /// Adds a single buffer to the [ArrayData]'s buffers
    pub fn add_buffer(mut self, b: Buffer) -> Self {
        self.buffers.push(b);
        self
    }

    /// Adds multiple buffers to the [ArrayData]'s buffers
    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
        self.buffers.extend(bs);
        self
    }

    /// Sets the child data of the [ArrayData], replacing any added previously
    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
        self.child_data = v;
        self
    }

    /// Adds a single child data to the [ArrayData]'s child data
    pub fn add_child_data(mut self, r: ArrayData) -> Self {
        self.child_data.push(r);
        self
    }

    /// Creates an array data, without any validation
    ///
    /// Note: This is shorthand for
    /// ```rust
    /// # #[expect(unsafe_op_in_unsafe_fn)]
    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
    /// # let _ = unsafe {
    /// builder.skip_validation(true).build().unwrap()
    /// # };
    /// ```
    ///
    /// # Safety
    ///
    /// The same caveats as [`ArrayData::new_unchecked`]
    /// apply.
    pub unsafe fn build_unchecked(self) -> ArrayData {
        unsafe { self.skip_validation(true) }.build().unwrap()
    }

    /// Creates an `ArrayData`, consuming `self`
    ///
    /// # Errors
    ///
    /// Returns an error if validation is performed and the data is found
    /// to be invalid.
    ///
    /// # Safety
    ///
    /// By default the underlying buffers are checked to ensure they are valid
    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
    /// to true (by the `unsafe` API) this validation is skipped. If the data is
    /// not valid, undefined behavior will result.
    pub fn build(self) -> Result<ArrayData, ArrowError> {
        let Self {
            data_type,
            len,
            null_count,
            null_bit_buffer,
            nulls,
            offset,
            buffers,
            child_data,
            align_buffers,
            skip_validation,
        } = self;

        // An explicitly provided `nulls` wins; otherwise fall back to
        // constructing the null buffer from `null_bit_buffer`, if any
        let nulls = nulls
            .or_else(|| {
                let buffer = null_bit_buffer?;
                let buffer = BooleanBuffer::new(buffer, offset, len);
                Some(match null_count {
                    Some(n) => {
                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
                        unsafe { NullBuffer::new_unchecked(buffer, n) }
                    }
                    None => NullBuffer::new(buffer),
                })
            })
            // Drop the null buffer entirely when it reports no nulls
            .filter(|b| b.null_count() != 0);

        let mut data = ArrayData {
            data_type,
            len,
            offset,
            buffers,
            child_data,
            nulls,
        };

        if align_buffers {
            data.align_buffers();
        }

        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
        // Note: validation always runs when the `force_validate` feature is enabled
        if !skip_validation.get() || cfg!(feature = "force_validate") {
            data.validate_data()?;
        }
        Ok(data)
    }

    /// Ensure that all buffers are aligned, copying data if necessary
    ///
    /// Rust requires that arrays are aligned to their corresponding primitive,
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
    ///
    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
    ///
    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
    /// when necessary, making it useful when interacting with buffers produced by other systems,
    /// e.g. IPC or FFI.
    ///
    /// If this flag is not enabled, [`Self::build`] returns an error on encountering
    /// insufficiently aligned buffers.
    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
        self.align_buffers = align_buffers;
        self
    }

    /// Skips validation of the data.
    ///
    /// If this flag is enabled, [`Self::build`] will skip validation of the
    /// data
    ///
    /// If this flag is not enabled, [`Self::build`] will validate that all
    /// buffers are valid and will return an error if any data is invalid.
    /// Validation can be expensive.
    ///
    /// # Safety
    ///
    /// If validation is skipped, the buffers must form a valid Arrow array,
    /// otherwise undefined behavior will result
    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
        unsafe {
            self.skip_validation.set(skip_validation);
        }
        self
    }
}
2269
2270impl From<ArrayData> for ArrayDataBuilder {
2271    fn from(d: ArrayData) -> Self {
2272        Self {
2273            data_type: d.data_type,
2274            len: d.len,
2275            offset: d.offset,
2276            buffers: d.buffers,
2277            child_data: d.child_data,
2278            nulls: d.nulls,
2279            null_bit_buffer: None,
2280            null_count: None,
2281            align_buffers: false,
2282            skip_validation: UnsafeFlag::new(),
2283        }
2284    }
2285}
2286
2287/// Get byte width of FixedSizeBinary size
2288/// # Panics:
2289/// - Panics if the `data_type` is not FixedSizeBinary
2290/// - Panics if byte width is negative
2291pub(crate) fn get_fixed_size_binary_width(data_type: &DataType) -> usize {
2292    match data_type {
2293        DataType::FixedSizeBinary(i) => {
2294            if *i < 0 {
2295                panic!("cannot compare FixedSizeBinary({})", *i);
2296            }
2297            *i as usize
2298        }
2299        _ => unreachable!(),
2300    }
2301}
2302
2303#[cfg(test)]
2304mod tests {
2305    use super::*;
2306    use arrow_schema::{Field, Fields};
2307
2308    // See arrow/tests/array_data_validation.rs for test of array validation
2309
2310    /// returns a buffer initialized with some constant value for tests
2311    fn make_i32_buffer(n: usize) -> Buffer {
2312        Buffer::from_slice_ref(vec![42i32; n])
2313    }
2314
2315    /// returns a buffer initialized with some constant value for tests
2316    fn make_f32_buffer(n: usize) -> Buffer {
2317        Buffer::from_slice_ref(vec![42f32; n])
2318    }
2319
2320    #[test]
2321    fn test_builder() {
2322        // Buffer needs to be at least 25 long
2323        let v = (0..25).collect::<Vec<i32>>();
2324        let b1 = Buffer::from_slice_ref(&v);
2325        let arr_data = ArrayData::builder(DataType::Int32)
2326            .len(20)
2327            .offset(5)
2328            .add_buffer(b1)
2329            .null_bit_buffer(Some(Buffer::from([
2330                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2331            ])))
2332            .build()
2333            .unwrap();
2334
2335        assert_eq!(20, arr_data.len());
2336        assert_eq!(10, arr_data.null_count());
2337        assert_eq!(5, arr_data.offset());
2338        assert_eq!(1, arr_data.buffers().len());
2339        assert_eq!(
2340            Buffer::from_slice_ref(&v).as_slice(),
2341            arr_data.buffers()[0].as_slice()
2342        );
2343    }
2344
2345    #[test]
2346    fn test_builder_with_child_data() {
2347        let child_arr_data = ArrayData::try_new(
2348            DataType::Int32,
2349            5,
2350            None,
2351            0,
2352            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2353            vec![],
2354        )
2355        .unwrap();
2356
2357        let field = Arc::new(Field::new("x", DataType::Int32, true));
2358        let data_type = DataType::Struct(vec![field].into());
2359
2360        let arr_data = ArrayData::builder(data_type)
2361            .len(5)
2362            .offset(0)
2363            .add_child_data(child_arr_data.clone())
2364            .build()
2365            .unwrap();
2366
2367        assert_eq!(5, arr_data.len());
2368        assert_eq!(1, arr_data.child_data().len());
2369        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2370    }
2371
2372    #[test]
2373    fn test_null_count() {
2374        let mut bit_v: [u8; 2] = [0; 2];
2375        bit_util::set_bit(&mut bit_v, 0);
2376        bit_util::set_bit(&mut bit_v, 3);
2377        bit_util::set_bit(&mut bit_v, 10);
2378        let arr_data = ArrayData::builder(DataType::Int32)
2379            .len(16)
2380            .add_buffer(make_i32_buffer(16))
2381            .null_bit_buffer(Some(Buffer::from(bit_v)))
2382            .build()
2383            .unwrap();
2384        assert_eq!(13, arr_data.null_count());
2385
2386        // Test with offset
2387        let mut bit_v: [u8; 2] = [0; 2];
2388        bit_util::set_bit(&mut bit_v, 0);
2389        bit_util::set_bit(&mut bit_v, 3);
2390        bit_util::set_bit(&mut bit_v, 10);
2391        let arr_data = ArrayData::builder(DataType::Int32)
2392            .len(12)
2393            .offset(2)
2394            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2395            .null_bit_buffer(Some(Buffer::from(bit_v)))
2396            .build()
2397            .unwrap();
2398        assert_eq!(10, arr_data.null_count());
2399    }
2400
2401    #[test]
2402    fn test_null_buffer_ref() {
2403        let mut bit_v: [u8; 2] = [0; 2];
2404        bit_util::set_bit(&mut bit_v, 0);
2405        bit_util::set_bit(&mut bit_v, 3);
2406        bit_util::set_bit(&mut bit_v, 10);
2407        let arr_data = ArrayData::builder(DataType::Int32)
2408            .len(16)
2409            .add_buffer(make_i32_buffer(16))
2410            .null_bit_buffer(Some(Buffer::from(bit_v)))
2411            .build()
2412            .unwrap();
2413        assert!(arr_data.nulls().is_some());
2414        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2415    }
2416
2417    #[test]
2418    fn test_slice() {
2419        let mut bit_v: [u8; 2] = [0; 2];
2420        bit_util::set_bit(&mut bit_v, 0);
2421        bit_util::set_bit(&mut bit_v, 3);
2422        bit_util::set_bit(&mut bit_v, 10);
2423        let data = ArrayData::builder(DataType::Int32)
2424            .len(16)
2425            .add_buffer(make_i32_buffer(16))
2426            .null_bit_buffer(Some(Buffer::from(bit_v)))
2427            .build()
2428            .unwrap();
2429        let new_data = data.slice(1, 15);
2430        assert_eq!(data.len() - 1, new_data.len());
2431        assert_eq!(1, new_data.offset());
2432        assert_eq!(data.null_count(), new_data.null_count());
2433
2434        // slice of a slice (removes one null)
2435        let new_data = new_data.slice(1, 14);
2436        assert_eq!(data.len() - 2, new_data.len());
2437        assert_eq!(2, new_data.offset());
2438        assert_eq!(data.null_count() - 1, new_data.null_count());
2439    }
2440
2441    #[test]
2442    #[should_panic(expected = "offset + length overflow")]
2443    fn test_slice_panics_on_offset_length_overflow() {
2444        let data = ArrayData::builder(DataType::Int32)
2445            .len(4)
2446            .add_buffer(make_i32_buffer(4))
2447            .build()
2448            .unwrap();
2449        let sliced = data.slice(1, 3);
2450
2451        sliced.slice(1, usize::MAX);
2452    }
2453
2454    #[test]
2455    fn test_typed_offsets_length_overflow() {
2456        let data = ArrayData {
2457            data_type: DataType::Binary,
2458            len: usize::MAX,
2459            offset: 0,
2460            buffers: vec![Buffer::from_slice_ref([0_i32])],
2461            child_data: vec![],
2462            nulls: None,
2463        };
2464        let err = data.typed_offsets::<i32>().unwrap_err();
2465
2466        assert_eq!(
2467            err.to_string(),
2468            format!(
2469                "Invalid argument error: Length {} with offset 1 overflows usize for Binary",
2470                usize::MAX
2471            )
2472        );
2473    }
2474
2475    #[test]
2476    fn test_validate_typed_buffer_length_overflow() {
2477        let data = ArrayData {
2478            data_type: DataType::Binary,
2479            len: 0,
2480            offset: 2,
2481            buffers: vec![Buffer::from_slice_ref([0_i32])],
2482            child_data: vec![],
2483            nulls: None,
2484        };
2485        let err = data.typed_buffer::<i32>(0, usize::MAX).unwrap_err();
2486
2487        assert_eq!(
2488            err.to_string(),
2489            format!(
2490                "Invalid argument error: Length {} with offset 2 overflows usize for Binary",
2491                usize::MAX
2492            )
2493        );
2494    }
2495
2496    // Exercises ArrayData::try_new with len + offset overflowing
2497    fn try_new_binary_length_offset_overflow() -> Result<ArrayData, ArrowError> {
2498        ArrayData::try_new(
2499            DataType::Binary,
2500            usize::MAX,
2501            None,
2502            1,
2503            vec![
2504                Buffer::from_slice_ref([0_i32]),
2505                Buffer::from_iter(std::iter::empty::<u8>()),
2506            ],
2507            vec![],
2508        )
2509    }
2510
2511    #[cfg(not(feature = "force_validate"))]
2512    #[test]
2513    fn test_try_new_length_offset_overflow() {
2514        let err = try_new_binary_length_offset_overflow().unwrap_err();
2515
2516        assert_eq!(
2517            err.to_string(),
2518            format!(
2519                "Invalid argument error: Length {} with offset 1 overflows usize for Binary",
2520                usize::MAX
2521            )
2522        );
2523    }
2524
2525    #[cfg(feature = "force_validate")]
2526    #[test]
2527    #[should_panic(
2528        expected = "Length 18446744073709551615 with offset 1 overflows usize for Binary"
2529    )]
2530    fn test_try_new_length_offset_overflow_force_validate() {
2531        try_new_binary_length_offset_overflow().unwrap();
2532    }
2533
2534    #[test]
2535    fn test_equality() {
2536        let int_data = ArrayData::builder(DataType::Int32)
2537            .len(1)
2538            .add_buffer(make_i32_buffer(1))
2539            .build()
2540            .unwrap();
2541
2542        let float_data = ArrayData::builder(DataType::Float32)
2543            .len(1)
2544            .add_buffer(make_f32_buffer(1))
2545            .build()
2546            .unwrap();
2547        assert_ne!(int_data, float_data);
2548        assert!(!int_data.ptr_eq(&float_data));
2549        assert!(int_data.ptr_eq(&int_data));
2550
2551        #[allow(clippy::redundant_clone)]
2552        let int_data_clone = int_data.clone();
2553        assert_eq!(int_data, int_data_clone);
2554        assert!(int_data.ptr_eq(&int_data_clone));
2555        assert!(int_data_clone.ptr_eq(&int_data));
2556
2557        let int_data_slice = int_data_clone.slice(1, 0);
2558        assert!(int_data_slice.ptr_eq(&int_data_slice));
2559        assert!(!int_data.ptr_eq(&int_data_slice));
2560        assert!(!int_data_slice.ptr_eq(&int_data));
2561
2562        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2563        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2564        let string_data = ArrayData::try_new(
2565            DataType::Utf8,
2566            3,
2567            Some(Buffer::from_iter(vec![true, false, true])),
2568            0,
2569            vec![offsets_buffer, data_buffer],
2570            vec![],
2571        )
2572        .unwrap();
2573
2574        assert_ne!(float_data, string_data);
2575        assert!(!float_data.ptr_eq(&string_data));
2576
2577        assert!(string_data.ptr_eq(&string_data));
2578
2579        #[allow(clippy::redundant_clone)]
2580        let string_data_cloned = string_data.clone();
2581        assert!(string_data_cloned.ptr_eq(&string_data));
2582        assert!(string_data.ptr_eq(&string_data_cloned));
2583
2584        let string_data_slice = string_data.slice(1, 2);
2585        assert!(string_data_slice.ptr_eq(&string_data_slice));
2586        assert!(!string_data_slice.ptr_eq(&string_data))
2587    }
2588
2589    #[test]
2590    fn test_slice_memory_size() {
2591        let mut bit_v: [u8; 2] = [0; 2];
2592        bit_util::set_bit(&mut bit_v, 0);
2593        bit_util::set_bit(&mut bit_v, 3);
2594        bit_util::set_bit(&mut bit_v, 10);
2595        let data = ArrayData::builder(DataType::Int32)
2596            .len(16)
2597            .add_buffer(make_i32_buffer(16))
2598            .null_bit_buffer(Some(Buffer::from(bit_v)))
2599            .build()
2600            .unwrap();
2601        let new_data = data.slice(1, 14);
2602        assert_eq!(
2603            data.get_slice_memory_size().unwrap() - 8,
2604            new_data.get_slice_memory_size().unwrap()
2605        );
2606        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2607        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2608        let string_data = ArrayData::try_new(
2609            DataType::Utf8,
2610            3,
2611            Some(Buffer::from_iter(vec![true, false, true])),
2612            0,
2613            vec![offsets_buffer, data_buffer],
2614            vec![],
2615        )
2616        .unwrap();
2617        let string_data_slice = string_data.slice(1, 2);
2618        //4 bytes of offset and 2 bytes of data reduced by slicing.
2619        assert_eq!(
2620            string_data.get_slice_memory_size().unwrap() - 6,
2621            string_data_slice.get_slice_memory_size().unwrap()
2622        );
2623    }
2624
2625    #[test]
2626    fn test_count_nulls() {
2627        let buffer = Buffer::from([0b00010110, 0b10011111]);
2628        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2629        let count = count_nulls(Some(&buffer), 0, 16);
2630        assert_eq!(count, 7);
2631
2632        let count = count_nulls(Some(&buffer), 4, 8);
2633        assert_eq!(count, 3);
2634    }
2635
2636    #[test]
2637    fn test_contains_nulls() {
2638        let buffer: Buffer =
2639            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2640        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2641        assert!(contains_nulls(Some(&buffer), 0, 6));
2642        assert!(contains_nulls(Some(&buffer), 0, 3));
2643        assert!(!contains_nulls(Some(&buffer), 3, 2));
2644        assert!(!contains_nulls(Some(&buffer), 0, 0));
2645    }
2646
2647    #[test]
2648    fn test_alignment() {
2649        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2650        let sliced = buffer.slice(1);
2651
2652        let mut data = ArrayData {
2653            data_type: DataType::Int32,
2654            len: 0,
2655            offset: 0,
2656            buffers: vec![buffer],
2657            child_data: vec![],
2658            nulls: None,
2659        };
2660        data.validate_full().unwrap();
2661
2662        // break alignment in data
2663        data.buffers[0] = sliced;
2664        let err = data.validate().unwrap_err();
2665
2666        assert_eq!(
2667            err.to_string(),
2668            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2669        );
2670
2671        data.align_buffers();
2672        data.validate_full().unwrap();
2673    }
2674
2675    #[test]
2676    fn test_alignment_struct() {
2677        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2678        let sliced = buffer.slice(1);
2679
2680        let child_data = ArrayData {
2681            data_type: DataType::Int32,
2682            len: 0,
2683            offset: 0,
2684            buffers: vec![buffer],
2685            child_data: vec![],
2686            nulls: None,
2687        };
2688
2689        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2690        let mut data = ArrayData {
2691            data_type: schema,
2692            len: 0,
2693            offset: 0,
2694            buffers: vec![],
2695            child_data: vec![child_data],
2696            nulls: None,
2697        };
2698        data.validate_full().unwrap();
2699
2700        // break alignment in child data
2701        data.child_data[0].buffers[0] = sliced;
2702        let err = data.validate().unwrap_err();
2703
2704        assert_eq!(
2705            err.to_string(),
2706            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2707        );
2708
2709        data.align_buffers();
2710        data.validate_full().unwrap();
2711    }
2712
2713    #[test]
2714    fn test_null_view_types() {
2715        let array_len = 32;
2716        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2717        assert_eq!(array.len(), array_len);
2718        for i in 0..array.len() {
2719            assert!(array.is_null(i));
2720        }
2721
2722        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2723        assert_eq!(array.len(), array_len);
2724        for i in 0..array.len() {
2725            assert!(array.is_null(i));
2726        }
2727
2728        let array = ArrayData::new_null(
2729            &DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2730            array_len,
2731        );
2732        assert_eq!(array.len(), array_len);
2733        for i in 0..array.len() {
2734            assert!(array.is_null(i));
2735        }
2736
2737        let array = ArrayData::new_null(
2738            &DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2739            array_len,
2740        );
2741        assert_eq!(array.len(), array_len);
2742        for i in 0..array.len() {
2743            assert!(array.is_null(i));
2744        }
2745    }
2746
2747    // Even when `force_validate` feature is on
2748    #[test]
2749    fn test_dont_panic_on_bad_input_when_using_try_new() {
2750        let empty_bytes = Buffer::default();
2751
2752        let array_data = ArrayData::try_new(
2753            DataType::Utf8,
2754            1, // len
2755            None,
2756            0,
2757            // the offsets says that we have 2 bytes but the buffer is empty
2758            vec![Buffer::from_vec(vec![0i32, 2i32]), empty_bytes],
2759            vec![],
2760        );
2761
2762        let res = array_data.expect_err("should get error");
2763
2764        assert_eq!(
2765            res.to_string(),
2766            format!("Invalid argument error: Last offset 2 of Utf8 is larger than values length 0",)
2767        );
2768    }
2769}