Skip to main content

arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
19//! common attributes and operations for Arrow array.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35    null_bit_buffer: Option<&NullBuffer>,
36    offset: usize,
37    len: usize,
38) -> bool {
39    match null_bit_buffer {
40        Some(buffer) => {
41            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42                Some((start, end)) => start != 0 || end != len,
43                None => len != 0, // No non-null values
44            }
45        }
46        None => false, // No null buffer
47    }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52    null_bit_buffer: Option<&NullBuffer>,
53    offset: usize,
54    len: usize,
55) -> usize {
56    if let Some(buf) = null_bit_buffer {
57        let buffer = buf.buffer();
58        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59    } else {
60        0
61    }
62}
63
64/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67    let empty_buffer = MutableBuffer::new(0);
68    match data_type {
69        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70        DataType::Boolean => {
71            let bytes = bit_util::ceil(capacity, 8);
72            let buffer = MutableBuffer::new(bytes);
73            [buffer, empty_buffer]
74        }
75        DataType::UInt8
76        | DataType::UInt16
77        | DataType::UInt32
78        | DataType::UInt64
79        | DataType::Int8
80        | DataType::Int16
81        | DataType::Int32
82        | DataType::Int64
83        | DataType::Float16
84        | DataType::Float32
85        | DataType::Float64
86        | DataType::Decimal32(_, _)
87        | DataType::Decimal64(_, _)
88        | DataType::Decimal128(_, _)
89        | DataType::Decimal256(_, _)
90        | DataType::Date32
91        | DataType::Time32(_)
92        | DataType::Date64
93        | DataType::Time64(_)
94        | DataType::Duration(_)
95        | DataType::Timestamp(_, _)
96        | DataType::Interval(_) => [
97            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98            empty_buffer,
99        ],
100        DataType::Utf8 | DataType::Binary => {
101            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102            // safety: `unsafe` code assumes that this buffer is initialized with one element
103            buffer.push(0i32);
104            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105        }
106        DataType::LargeUtf8 | DataType::LargeBinary => {
107            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108            // safety: `unsafe` code assumes that this buffer is initialized with one element
109            buffer.push(0i64);
110            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111        }
112        DataType::BinaryView | DataType::Utf8View => [
113            MutableBuffer::new(capacity * mem::size_of::<u128>()),
114            empty_buffer,
115        ],
116        DataType::List(_) | DataType::Map(_, _) => {
117            // offset buffer always starts with a zero
118            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119            buffer.push(0i32);
120            [buffer, empty_buffer]
121        }
122        DataType::ListView(_) => [
123            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124            MutableBuffer::new(capacity * mem::size_of::<i32>()),
125        ],
126        DataType::LargeList(_) => {
127            // offset buffer always starts with a zero
128            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129            buffer.push(0i64);
130            [buffer, empty_buffer]
131        }
132        DataType::LargeListView(_) => [
133            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134            MutableBuffer::new(capacity * mem::size_of::<i64>()),
135        ],
136        DataType::FixedSizeBinary(size) => {
137            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138        }
139        DataType::Dictionary(k, _) => [
140            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141            empty_buffer,
142        ],
143        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144            [empty_buffer, MutableBuffer::new(0)]
145        }
146        DataType::Union(_, mode) => {
147            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148            match mode {
149                UnionMode::Sparse => [type_ids, empty_buffer],
150                UnionMode::Dense => {
151                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152                    [type_ids, offsets]
153                }
154            }
155        }
156    }
157}
158
159/// A generic representation of Arrow array data which encapsulates common attributes
160/// and operations for Arrow array.
161///
162/// Specific operations for different arrays types (e.g., primitive, list, struct)
163/// are implemented in `Array`.
164///
165/// # Memory Layout
166///
167/// `ArrayData` has references to one or more underlying data buffers
168/// and optional child ArrayData, depending on type as illustrated
169/// below. Bitmaps are not shown for simplicity but they are stored
170/// similarly to the buffers.
171///
172/// ```text
173///                        offset
174///                       points to
175/// ┌───────────────────┐ start of  ┌───────┐       Different
176/// │                   │   data    │       │     ArrayData may
177/// │ArrayData {        │           │....   │     also refers to
178/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
179/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
180/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
181/// │  buffers: [       │           │5882   │◀─
182/// │    ...            │  │        │4323   │
183/// │  ]                │   ─ ─ ─ ─▶│4859   │
184/// │  child_data: [    │           │....   │
185/// │    ...            │           │       │
186/// │  ]                │           └───────┘
187/// │}                  │
188/// │                   │            Shared Buffer uses
189/// │               │   │            bytes::Bytes to hold
190/// └───────────────────┘            actual data values
191///           ┌ ─ ─ ┘
192///
193///           ▼
194/// ┌───────────────────┐
195/// │ArrayData {        │
196/// │  ...              │
197/// │}                  │
198/// │                   │
199/// └───────────────────┘
200///
201/// Child ArrayData may also have its own buffers and children
202/// ```
203
204#[derive(Debug, Clone)]
205pub struct ArrayData {
206    /// The data type
207    data_type: DataType,
208
209    /// The number of elements
210    len: usize,
211
212    /// The offset in number of items (not bytes).
213    ///
214    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
215    /// does NOT apply to [`Self::nulls`].
216    offset: usize,
217
218    /// The buffers that store the actual data for this array, as defined
219    /// in the [Arrow Spec].
220    ///
221    /// Depending on the array types, [`Self::buffers`] can hold different
222    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
223    /// positions.
224    ///
225    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
226    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
227    /// we need to satisfy `len`.
228    ///
229    /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout)
230    buffers: Vec<Buffer>,
231
232    /// The child(ren) of this array.
233    ///
234    /// Only non-empty for nested types, such as `ListArray` and
235    /// `StructArray`.
236    ///
237    /// The first logical element in each child element begins at `offset`.
238    ///
239    /// If the child element also has an offset then these offsets are
240    /// cumulative.
241    child_data: Vec<ArrayData>,
242
243    /// The null bitmap.
244    ///
245    /// `None` indicates all values are non-null in this array.
246    ///
247    /// [`Self::offset]` does not apply to the null bitmap. While the
248    /// BooleanBuffer may be sliced (have its own offset) internally, this
249    /// `NullBuffer` always represents exactly `len` elements.
250    nulls: Option<NullBuffer>,
251}
252
253/// A thread-safe, shared reference to the Arrow array data.
254pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257    /// Create a new ArrayData instance;
258    ///
259    /// If `null_count` is not specified, the number of nulls in
260    /// null_bit_buffer is calculated.
261    ///
262    /// If the number of nulls is 0 then the null_bit_buffer
263    /// is set to `None`.
264    ///
265    /// # Safety
266    ///
267    /// The input values *must* form a valid Arrow array for
268    /// `data_type`, or undefined behavior can result.
269    ///
270    /// Note: This is a low level API and most users of the arrow
271    /// crate should create arrays using the methods in the `array`
272    /// module.
273    pub unsafe fn new_unchecked(
274        data_type: DataType,
275        len: usize,
276        null_count: Option<usize>,
277        null_bit_buffer: Option<Buffer>,
278        offset: usize,
279        buffers: Vec<Buffer>,
280        child_data: Vec<ArrayData>,
281    ) -> Self {
282        let mut skip_validation = UnsafeFlag::new();
283        // SAFETY: caller responsible for ensuring data is valid
284        unsafe { skip_validation.set(true) };
285
286        ArrayDataBuilder {
287            data_type,
288            len,
289            null_count,
290            null_bit_buffer,
291            nulls: None,
292            offset,
293            buffers,
294            child_data,
295            align_buffers: false,
296            skip_validation,
297        }
298        .build()
299        .unwrap()
300    }
301
302    /// Create a new ArrayData, validating that the provided buffers form a valid
303    /// Arrow array of the specified data type.
304    ///
305    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
306    /// is set to `None`.
307    ///
308    /// Internally this calls through to [`Self::validate_data`]
309    ///
310    /// Note: This is a low level API and most users of the arrow crate should create
311    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
312    /// or [`ArrayDataBuilder`].
313    ///
314    /// See also [`Self::into_parts`] to recover the fields
315    pub fn try_new(
316        data_type: DataType,
317        len: usize,
318        null_bit_buffer: Option<Buffer>,
319        offset: usize,
320        buffers: Vec<Buffer>,
321        child_data: Vec<ArrayData>,
322    ) -> Result<Self, ArrowError> {
323        // we must check the length of `null_bit_buffer` first
324        // because we use this buffer to calculate `null_count`
325        // in `Self::new_unchecked`.
326        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
327            let needed_len = bit_util::ceil(len + offset, 8);
328            if null_bit_buffer.len() < needed_len {
329                return Err(ArrowError::InvalidArgumentError(format!(
330                    "null_bit_buffer size too small. got {} needed {}",
331                    null_bit_buffer.len(),
332                    needed_len
333                )));
334            }
335        }
336        // Safety justification: `validate_full` is called below
337        let new_self = unsafe {
338            Self::new_unchecked(
339                data_type,
340                len,
341                None,
342                null_bit_buffer,
343                offset,
344                buffers,
345                child_data,
346            )
347        };
348
349        // As the data is not trusted, do a full validation of its contents
350        // We don't need to validate children as we can assume that the
351        // [`ArrayData`] in `child_data` have already been validated through
352        // a call to `ArrayData::try_new` or created using unsafe
353        new_self.validate_data()?;
354        Ok(new_self)
355    }
356
357    /// Return the constituent parts of this ArrayData
358    ///
359    /// This is the inverse of [`ArrayData::try_new`].
360    ///
361    /// Returns `(data_type, len, nulls, offset, buffers, child_data)`
362    pub fn into_parts(
363        self,
364    ) -> (
365        DataType,
366        usize,
367        Option<NullBuffer>,
368        usize,
369        Vec<Buffer>,
370        Vec<ArrayData>,
371    ) {
372        let Self {
373            data_type,
374            len,
375            nulls,
376            offset,
377            buffers,
378            child_data,
379        } = self;
380
381        (data_type, len, nulls, offset, buffers, child_data)
382    }
383
384    /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`]
385    #[inline]
386    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
387        ArrayDataBuilder::new(data_type)
388    }
389
390    /// Returns a reference to the [`DataType`] of this [`ArrayData`]
391    #[inline]
392    pub const fn data_type(&self) -> &DataType {
393        &self.data_type
394    }
395
396    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
397    pub fn buffers(&self) -> &[Buffer] {
398        &self.buffers
399    }
400
401    /// Returns a slice of children [`ArrayData`]. This will be non
402    /// empty for type such as lists and structs.
403    pub fn child_data(&self) -> &[ArrayData] {
404        &self.child_data[..]
405    }
406
407    /// Returns whether the element at index `i` is null
408    #[inline]
409    pub fn is_null(&self, i: usize) -> bool {
410        match &self.nulls {
411            Some(v) => v.is_null(i),
412            None => false,
413        }
414    }
415
416    /// Returns a reference to the null buffer of this [`ArrayData`] if any
417    ///
418    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
419    #[inline]
420    pub fn nulls(&self) -> Option<&NullBuffer> {
421        self.nulls.as_ref()
422    }
423
424    /// Returns whether the element at index `i` is not null
425    #[inline]
426    pub fn is_valid(&self, i: usize) -> bool {
427        !self.is_null(i)
428    }
429
430    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
431    #[inline]
432    pub const fn len(&self) -> usize {
433        self.len
434    }
435
436    /// Returns whether this [`ArrayData`] is empty
437    #[inline]
438    pub const fn is_empty(&self) -> bool {
439        self.len == 0
440    }
441
442    /// Returns the offset of this [`ArrayData`]
443    #[inline]
444    pub const fn offset(&self) -> usize {
445        self.offset
446    }
447
448    /// Returns the total number of nulls in this array
449    #[inline]
450    pub fn null_count(&self) -> usize {
451        self.nulls
452            .as_ref()
453            .map(|x| x.null_count())
454            .unwrap_or_default()
455    }
456
457    /// Returns the total number of bytes of memory occupied by the
458    /// buffers owned by this [`ArrayData`] and all of its
459    /// children. (See also diagram on [`ArrayData`]).
460    ///
461    /// Note that this [`ArrayData`] may only refer to a subset of the
462    /// data in the underlying [`Buffer`]s (due to `offset` and
463    /// `length`), but the size returned includes the entire size of
464    /// the buffers.
465    ///
466    /// If multiple [`ArrayData`]s refer to the same underlying
467    /// [`Buffer`]s they will both report the same size.
468    pub fn get_buffer_memory_size(&self) -> usize {
469        let mut size = 0;
470        for buffer in &self.buffers {
471            size += buffer.capacity();
472        }
473        if let Some(bitmap) = &self.nulls {
474            size += bitmap.buffer().capacity()
475        }
476        for child in &self.child_data {
477            size += child.get_buffer_memory_size();
478        }
479        size
480    }
481
482    /// Returns the total number of the bytes of memory occupied by
483    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
484    ///
485    /// This is approximately the number of bytes if a new
486    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
487    /// exactly the data needed.
488    ///
489    /// For example, a [`DataType::Int64`] with `100` elements,
490    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
491    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
492    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
493    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
494    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
495        let mut result: usize = 0;
496        let layout = layout(&self.data_type);
497
498        for spec in layout.buffers.iter() {
499            match spec {
500                BufferSpec::FixedWidth { byte_width, .. } => {
501                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
502                        ArrowError::ComputeError(
503                            "Integer overflow computing buffer size".to_string(),
504                        )
505                    })?;
506                    result += buffer_size;
507                }
508                BufferSpec::VariableWidth => {
509                    let buffer_len = match self.data_type {
510                        DataType::Utf8 | DataType::Binary => {
511                            let offsets = self.typed_offsets::<i32>()?;
512                            (offsets[self.len] - offsets[0]) as usize
513                        }
514                        DataType::LargeUtf8 | DataType::LargeBinary => {
515                            let offsets = self.typed_offsets::<i64>()?;
516                            (offsets[self.len] - offsets[0]) as usize
517                        }
518                        _ => {
519                            return Err(ArrowError::NotYetImplemented(format!(
520                                "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
521                                self.data_type
522                            )));
523                        }
524                    };
525                    result += buffer_len;
526                }
527                BufferSpec::BitMap => {
528                    let buffer_size = bit_util::ceil(self.len, 8);
529                    result += buffer_size;
530                }
531                BufferSpec::AlwaysNull => {
532                    // Nothing to do
533                }
534            }
535        }
536
537        if self.nulls().is_some() {
538            result += bit_util::ceil(self.len, 8);
539        }
540
541        for child in &self.child_data {
542            result += child.get_slice_memory_size()?;
543        }
544        Ok(result)
545    }
546
547    /// Returns the total number of bytes of memory occupied
548    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
549    /// children. (See also diagram on [`ArrayData`]).
550    ///
551    /// Equivalent to:
552    ///  `size_of_val(self)` +
553    ///  [`Self::get_buffer_memory_size`] +
554    ///  `size_of_val(child)` for all children
555    pub fn get_array_memory_size(&self) -> usize {
556        let mut size = mem::size_of_val(self);
557
558        // Calculate rest of the fields top down which contain actual data
559        for buffer in &self.buffers {
560            size += mem::size_of::<Buffer>();
561            size += buffer.capacity();
562        }
563        if let Some(nulls) = &self.nulls {
564            size += nulls.buffer().capacity();
565        }
566        for child in &self.child_data {
567            size += child.get_array_memory_size();
568        }
569
570        size
571    }
572
573    /// Creates a zero-copy slice of itself. This creates a new
574    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
575    /// different offset and len
576    ///
577    /// # Panics
578    ///
579    /// Panics if `offset + length > self.len()`.
580    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
581        assert!((offset + length) <= self.len());
582
583        if let DataType::Struct(_) = self.data_type() {
584            // Slice into children
585            let new_offset = self.offset + offset;
586            ArrayData {
587                data_type: self.data_type().clone(),
588                len: length,
589                offset: new_offset,
590                buffers: self.buffers.clone(),
591                // Slice child data, to propagate offsets down to them
592                child_data: self
593                    .child_data()
594                    .iter()
595                    .map(|data| data.slice(offset, length))
596                    .collect(),
597                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
598            }
599        } else {
600            let mut new_data = self.clone();
601
602            new_data.len = length;
603            new_data.offset = offset + self.offset;
604            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
605
606            new_data
607        }
608    }
609
610    /// Returns the `buffer` as a slice of type `T` starting at self.offset
611    ///
612    /// # Panics
613    /// This function panics if:
614    /// * the buffer is not byte-aligned with type T, or
615    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
616    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
617        &self.buffers()[buffer].typed_data()[self.offset..]
618    }
619
620    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
621    pub fn new_null(data_type: &DataType, len: usize) -> Self {
622        let bit_len = bit_util::ceil(len, 8);
623        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
624
625        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
626            Some(width) => (vec![zeroed(width * len)], vec![], true),
627            None => match data_type {
628                DataType::Null => (vec![], vec![], false),
629                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
630                DataType::Binary | DataType::Utf8 => {
631                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
632                }
633                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
634                DataType::LargeBinary | DataType::LargeUtf8 => {
635                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
636                }
637                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
638                DataType::List(f) | DataType::Map(f, _) => (
639                    vec![zeroed((len + 1) * 4)],
640                    vec![ArrayData::new_empty(f.data_type())],
641                    true,
642                ),
643                DataType::LargeList(f) => (
644                    vec![zeroed((len + 1) * 8)],
645                    vec![ArrayData::new_empty(f.data_type())],
646                    true,
647                ),
648                DataType::ListView(f) => (
649                    vec![zeroed(len * 4), zeroed(len * 4)],
650                    vec![ArrayData::new_empty(f.data_type())],
651                    true,
652                ),
653                DataType::LargeListView(f) => (
654                    vec![zeroed(len * 8), zeroed(len * 8)],
655                    vec![ArrayData::new_empty(f.data_type())],
656                    true,
657                ),
658                DataType::FixedSizeList(f, list_len) => (
659                    vec![],
660                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
661                    true,
662                ),
663                DataType::Struct(fields) => (
664                    vec![],
665                    fields
666                        .iter()
667                        .map(|f| Self::new_null(f.data_type(), len))
668                        .collect(),
669                    true,
670                ),
671                DataType::Dictionary(k, v) => (
672                    vec![zeroed(k.primitive_width().unwrap() * len)],
673                    vec![ArrayData::new_empty(v.as_ref())],
674                    true,
675                ),
676                DataType::Union(f, mode) => {
677                    let (id, _) = f.iter().next().unwrap();
678                    let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
679                    let buffers = match mode {
680                        UnionMode::Sparse => vec![ids],
681                        UnionMode::Dense => {
682                            let end_offset = i32::from_usize(len).unwrap();
683                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
684                        }
685                    };
686
687                    let children = f
688                        .iter()
689                        .enumerate()
690                        .map(|(idx, (_, f))| {
691                            if idx == 0 || *mode == UnionMode::Sparse {
692                                Self::new_null(f.data_type(), len)
693                            } else {
694                                Self::new_empty(f.data_type())
695                            }
696                        })
697                        .collect();
698
699                    (buffers, children, false)
700                }
701                DataType::RunEndEncoded(r, v) => {
702                    if len == 0 {
703                        // For empty arrays, create zero-length child arrays.
704                        let runs = ArrayData::new_empty(r.data_type());
705                        let values = ArrayData::new_empty(v.data_type());
706                        (vec![], vec![runs, values], false)
707                    } else {
708                        let runs = match r.data_type() {
709                            DataType::Int16 => {
710                                let i = i16::from_usize(len).expect("run overflow");
711                                Buffer::from_slice_ref([i])
712                            }
713                            DataType::Int32 => {
714                                let i = i32::from_usize(len).expect("run overflow");
715                                Buffer::from_slice_ref([i])
716                            }
717                            DataType::Int64 => {
718                                let i = i64::from_usize(len).expect("run overflow");
719                                Buffer::from_slice_ref([i])
720                            }
721                            dt => unreachable!("Invalid run ends data type {dt}"),
722                        };
723
724                        let builder = ArrayData::builder(r.data_type().clone())
725                            .len(1)
726                            .buffers(vec![runs]);
727
728                        // SAFETY:
729                        // Valid by construction
730                        let runs = unsafe { builder.build_unchecked() };
731                        (
732                            vec![],
733                            vec![runs, ArrayData::new_null(v.data_type(), 1)],
734                            false,
735                        )
736                    }
737                }
738                // Handled by Some(width) branch above
739                DataType::Int8
740                | DataType::Int16
741                | DataType::Int32
742                | DataType::Int64
743                | DataType::UInt8
744                | DataType::UInt16
745                | DataType::UInt32
746                | DataType::UInt64
747                | DataType::Float16
748                | DataType::Float32
749                | DataType::Float64
750                | DataType::Timestamp(_, _)
751                | DataType::Date32
752                | DataType::Date64
753                | DataType::Time32(_)
754                | DataType::Time64(_)
755                | DataType::Duration(_)
756                | DataType::Interval(_)
757                | DataType::Decimal32(_, _)
758                | DataType::Decimal64(_, _)
759                | DataType::Decimal128(_, _)
760                | DataType::Decimal256(_, _) => unreachable!("{data_type}"),
761            },
762        };
763
764        let mut builder = ArrayDataBuilder::new(data_type.clone())
765            .len(len)
766            .buffers(buffers)
767            .child_data(child_data);
768
769        if has_nulls {
770            builder = builder.nulls(Some(NullBuffer::new_null(len)))
771        }
772
773        // SAFETY:
774        // Data valid by construction
775        unsafe { builder.build_unchecked() }
776    }
777
778    /// Returns a new empty [ArrayData] valid for `data_type`.
779    pub fn new_empty(data_type: &DataType) -> Self {
780        Self::new_null(data_type, 0)
781    }
782
783    /// Verifies that the buffers meet the minimum alignment requirements for the data type
784    ///
785    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
786    ///
787    /// This can be useful for when interacting with data sent over IPC or FFI, that may
788    /// not meet the minimum alignment requirements
789    ///
790    /// This also aligns buffers of children data
791    pub fn align_buffers(&mut self) {
792        let layout = layout(&self.data_type);
793        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
794            if let BufferSpec::FixedWidth { alignment, .. } = spec {
795                if buffer.as_ptr().align_offset(*alignment) != 0 {
796                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
797                }
798            }
799        }
800        // align children data recursively
801        for data in self.child_data.iter_mut() {
802            data.align_buffers()
803        }
804    }
805
806    /// "cheap" validation of an `ArrayData`. Ensures buffers are
807    /// sufficiently sized to store `len` + `offset` total elements of
808    /// `data_type` and performs other inexpensive consistency checks.
809    ///
810    /// This check is "cheap" in the sense that it does not validate the
811    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
812    /// are within the bounds of the values buffer).
813    ///
814    /// See [ArrayData::validate_data] to validate fully the offset content
815    /// and the validity of utf8 data
816    pub fn validate(&self) -> Result<(), ArrowError> {
817        // Need at least this mich space in each buffer
818        let len_plus_offset = self.len + self.offset;
819
820        // Check that the data layout conforms to the spec
821        let layout = layout(&self.data_type);
822
823        if !layout.can_contain_null_mask && self.nulls.is_some() {
824            return Err(ArrowError::InvalidArgumentError(format!(
825                "Arrays of type {:?} cannot contain a null bitmask",
826                self.data_type,
827            )));
828        }
829
830        // Check data buffers length for view types and other types
831        if self.buffers.len() < layout.buffers.len()
832            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
833        {
834            return Err(ArrowError::InvalidArgumentError(format!(
835                "Expected {} buffers in array of type {:?}, got {}",
836                layout.buffers.len(),
837                self.data_type,
838                self.buffers.len(),
839            )));
840        }
841
842        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
843            match spec {
844                BufferSpec::FixedWidth {
845                    byte_width,
846                    alignment,
847                } => {
848                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
849
850                    if buffer.len() < min_buffer_size {
851                        return Err(ArrowError::InvalidArgumentError(format!(
852                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
853                            min_buffer_size,
854                            i,
855                            self.data_type,
856                            buffer.len()
857                        )));
858                    }
859
860                    let align_offset = buffer.as_ptr().align_offset(*alignment);
861                    if align_offset != 0 {
862                        return Err(ArrowError::InvalidArgumentError(format!(
863                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
864                            self.data_type,
865                            align_offset.min(alignment - align_offset)
866                        )));
867                    }
868                }
869                BufferSpec::VariableWidth => {
870                    // not cheap to validate (need to look at the
871                    // data). Partially checked in validate_offsets
872                    // called below. Can check with `validate_full`
873                }
874                BufferSpec::BitMap => {
875                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
876                    if buffer.len() < min_buffer_size {
877                        return Err(ArrowError::InvalidArgumentError(format!(
878                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
879                            min_buffer_size,
880                            i,
881                            self.data_type,
882                            buffer.len()
883                        )));
884                    }
885                }
886                BufferSpec::AlwaysNull => {
887                    // Nothing to validate
888                }
889            }
890        }
891
892        // check null bit buffer size
893        if let Some(nulls) = self.nulls() {
894            if nulls.null_count() > self.len {
895                return Err(ArrowError::InvalidArgumentError(format!(
896                    "null_count {} for an array exceeds length of {} elements",
897                    nulls.null_count(),
898                    self.len
899                )));
900            }
901
902            let actual_len = nulls.validity().len();
903            let needed_len = bit_util::ceil(len_plus_offset, 8);
904            if actual_len < needed_len {
905                return Err(ArrowError::InvalidArgumentError(format!(
906                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
907                )));
908            }
909
910            if nulls.len() != self.len {
911                return Err(ArrowError::InvalidArgumentError(format!(
912                    "null buffer incorrect size. got {} expected {}",
913                    nulls.len(),
914                    self.len
915                )));
916            }
917        }
918
919        self.validate_child_data()?;
920
921        // Additional Type specific checks
922        match &self.data_type {
923            DataType::Utf8 | DataType::Binary => {
924                self.validate_offsets::<i32>(self.buffers[1].len())?;
925            }
926            DataType::LargeUtf8 | DataType::LargeBinary => {
927                self.validate_offsets::<i64>(self.buffers[1].len())?;
928            }
929            DataType::Dictionary(key_type, _value_type) => {
930                // At the moment, constructing a DictionaryArray will also check this
931                if !DataType::is_dictionary_key_type(key_type) {
932                    return Err(ArrowError::InvalidArgumentError(format!(
933                        "Dictionary key type must be integer, but was {key_type}"
934                    )));
935                }
936            }
937            DataType::RunEndEncoded(run_ends_type, _) => {
938                if run_ends_type.is_nullable() {
939                    return Err(ArrowError::InvalidArgumentError(
940                        "The nullable should be set to false for the field defining run_ends array.".to_string()
941                    ));
942                }
943                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
944                    return Err(ArrowError::InvalidArgumentError(format!(
945                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
946                        run_ends_type.data_type()
947                    )));
948                }
949            }
950            _ => {}
951        };
952
953        Ok(())
954    }
955
956    /// Returns a reference to the data in `buffer` as a typed slice
957    /// (typically `&[i32]` or `&[i64]`) after validating. The
958    /// returned slice is guaranteed to have at least `self.len + 1`
959    /// entries.
960    ///
961    /// For an empty array, the `buffer` can also be empty.
962    fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
963        // An empty list-like array can have 0 offsets
964        if self.len == 0 && self.buffers[0].is_empty() {
965            return Ok(&[]);
966        }
967
968        self.typed_buffer(0, self.len + 1)
969    }
970
971    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
972    fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
973        &self,
974        idx: usize,
975        len: usize,
976    ) -> Result<&[T], ArrowError> {
977        let buffer = &self.buffers[idx];
978
979        let required_len = (len + self.offset) * mem::size_of::<T>();
980
981        if buffer.len() < required_len {
982            return Err(ArrowError::InvalidArgumentError(format!(
983                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
984                idx,
985                self.data_type,
986                required_len,
987                buffer.len()
988            )));
989        }
990
991        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
992    }
993
994    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
995    /// offsets (of type T) into some other buffer of `values_length` bytes long
996    fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
997        &self,
998        values_length: usize,
999    ) -> Result<(), ArrowError> {
1000        // Justification: buffer size was validated above
1001        let offsets = self.typed_offsets::<T>()?;
1002        if offsets.is_empty() {
1003            return Ok(());
1004        }
1005
1006        let first_offset = offsets[0].to_usize().ok_or_else(|| {
1007            ArrowError::InvalidArgumentError(format!(
1008                "Error converting offset[0] ({}) to usize for {}",
1009                offsets[0], self.data_type
1010            ))
1011        })?;
1012
1013        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
1014            ArrowError::InvalidArgumentError(format!(
1015                "Error converting offset[{}] ({}) to usize for {}",
1016                self.len, offsets[self.len], self.data_type
1017            ))
1018        })?;
1019
1020        if first_offset > values_length {
1021            return Err(ArrowError::InvalidArgumentError(format!(
1022                "First offset {} of {} is larger than values length {}",
1023                first_offset, self.data_type, values_length,
1024            )));
1025        }
1026
1027        if last_offset > values_length {
1028            return Err(ArrowError::InvalidArgumentError(format!(
1029                "Last offset {} of {} is larger than values length {}",
1030                last_offset, self.data_type, values_length,
1031            )));
1032        }
1033
1034        if first_offset > last_offset {
1035            return Err(ArrowError::InvalidArgumentError(format!(
1036                "First offset {} in {} is smaller than last offset {}",
1037                first_offset, self.data_type, last_offset,
1038            )));
1039        }
1040
1041        Ok(())
1042    }
1043
1044    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
1045    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
1046    fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
1047        &self,
1048        values_length: usize,
1049    ) -> Result<(), ArrowError> {
1050        let offsets: &[T] = self.typed_buffer(0, self.len)?;
1051        let sizes: &[T] = self.typed_buffer(1, self.len)?;
1052        if offsets.len() != sizes.len() {
1053            return Err(ArrowError::ComputeError(format!(
1054                "ListView offsets len {} does not match sizes len {}",
1055                offsets.len(),
1056                sizes.len()
1057            )));
1058        }
1059
1060        for i in 0..sizes.len() {
1061            let size = sizes[i].to_usize().ok_or_else(|| {
1062                ArrowError::InvalidArgumentError(format!(
1063                    "Error converting size[{}] ({}) to usize for {}",
1064                    i, sizes[i], self.data_type
1065                ))
1066            })?;
1067            let offset = offsets[i].to_usize().ok_or_else(|| {
1068                ArrowError::InvalidArgumentError(format!(
1069                    "Error converting offset[{}] ({}) to usize for {}",
1070                    i, offsets[i], self.data_type
1071                ))
1072            })?;
1073            if size
1074                .checked_add(offset)
1075                .expect("Offset and size have exceeded the usize boundary")
1076                > values_length
1077            {
1078                return Err(ArrowError::InvalidArgumentError(format!(
1079                    "Size {} at index {} is larger than the remaining values for {}",
1080                    size, i, self.data_type
1081                )));
1082            }
1083        }
1084        Ok(())
1085    }
1086
1087    /// Validates the layout of `child_data` ArrayData structures
1088    fn validate_child_data(&self) -> Result<(), ArrowError> {
1089        match &self.data_type {
1090            DataType::List(field) | DataType::Map(field, _) => {
1091                let values_data = self.get_single_valid_child_data(field.data_type())?;
1092                self.validate_offsets::<i32>(values_data.len)?;
1093                Ok(())
1094            }
1095            DataType::LargeList(field) => {
1096                let values_data = self.get_single_valid_child_data(field.data_type())?;
1097                self.validate_offsets::<i64>(values_data.len)?;
1098                Ok(())
1099            }
1100            DataType::ListView(field) => {
1101                let values_data = self.get_single_valid_child_data(field.data_type())?;
1102                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1103                Ok(())
1104            }
1105            DataType::LargeListView(field) => {
1106                let values_data = self.get_single_valid_child_data(field.data_type())?;
1107                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1108                Ok(())
1109            }
1110            DataType::FixedSizeList(field, list_size) => {
1111                let values_data = self.get_single_valid_child_data(field.data_type())?;
1112
1113                let list_size: usize = (*list_size).try_into().map_err(|_| {
1114                    ArrowError::InvalidArgumentError(format!(
1115                        "{} has a negative list_size {}",
1116                        self.data_type, list_size
1117                    ))
1118                })?;
1119
1120                let expected_values_len = self.len
1121                    .checked_mul(list_size)
1122                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1123
1124                if values_data.len < expected_values_len {
1125                    return Err(ArrowError::InvalidArgumentError(format!(
1126                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1127                        values_data.len, self.len, list_size, self.data_type
1128                    )));
1129                }
1130
1131                Ok(())
1132            }
1133            DataType::Struct(fields) => {
1134                self.validate_num_child_data(fields.len())?;
1135                for (i, field) in fields.iter().enumerate() {
1136                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1137
1138                    // Ensure child field has sufficient size
1139                    if field_data.len < self.len {
1140                        return Err(ArrowError::InvalidArgumentError(format!(
1141                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1142                            self.data_type,
1143                            i,
1144                            field.name(),
1145                            field_data.len,
1146                            self.len
1147                        )));
1148                    }
1149                }
1150                Ok(())
1151            }
1152            DataType::RunEndEncoded(run_ends_field, values_field) => {
1153                self.validate_num_child_data(2)?;
1154                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1155                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1156                if run_ends_data.len != values_data.len {
1157                    return Err(ArrowError::InvalidArgumentError(format!(
1158                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1159                        run_ends_data.len, values_data.len
1160                    )));
1161                }
1162                if run_ends_data.nulls.is_some() {
1163                    return Err(ArrowError::InvalidArgumentError(
1164                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1165                    ));
1166                }
1167                Ok(())
1168            }
1169            DataType::Union(fields, mode) => {
1170                self.validate_num_child_data(fields.len())?;
1171
1172                for (i, (_, field)) in fields.iter().enumerate() {
1173                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1174
1175                    if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1176                        return Err(ArrowError::InvalidArgumentError(format!(
1177                            "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1178                            i,
1179                            field_data.len,
1180                            self.len + self.offset
1181                        )));
1182                    }
1183                }
1184                Ok(())
1185            }
1186            DataType::Dictionary(_key_type, value_type) => {
1187                self.get_single_valid_child_data(value_type)?;
1188                Ok(())
1189            }
1190            _ => {
1191                // other types do not have child data
1192                if !self.child_data.is_empty() {
1193                    return Err(ArrowError::InvalidArgumentError(format!(
1194                        "Expected no child arrays for type {} but got {}",
1195                        self.data_type,
1196                        self.child_data.len()
1197                    )));
1198                }
1199                Ok(())
1200            }
1201        }
1202    }
1203
1204    /// Ensures that this array data has a single child_data with the
1205    /// expected type, and calls `validate()` on it. Returns a
1206    /// reference to that child_data
1207    fn get_single_valid_child_data(
1208        &self,
1209        expected_type: &DataType,
1210    ) -> Result<&ArrayData, ArrowError> {
1211        self.validate_num_child_data(1)?;
1212        self.get_valid_child_data(0, expected_type)
1213    }
1214
1215    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1216    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1217        if self.child_data.len() != expected_len {
1218            Err(ArrowError::InvalidArgumentError(format!(
1219                "Value data for {} should contain {} child data array(s), had {}",
1220                self.data_type,
1221                expected_len,
1222                self.child_data.len()
1223            )))
1224        } else {
1225            Ok(())
1226        }
1227    }
1228
1229    /// Ensures that `child_data[i]` has the expected type, calls
1230    /// `validate()` on it, and returns a reference to that child_data
1231    fn get_valid_child_data(
1232        &self,
1233        i: usize,
1234        expected_type: &DataType,
1235    ) -> Result<&ArrayData, ArrowError> {
1236        let values_data = self.child_data.get(i).ok_or_else(|| {
1237            ArrowError::InvalidArgumentError(format!(
1238                "{} did not have enough child arrays. Expected at least {} but had only {}",
1239                self.data_type,
1240                i + 1,
1241                self.child_data.len()
1242            ))
1243        })?;
1244
1245        if expected_type != &values_data.data_type {
1246            return Err(ArrowError::InvalidArgumentError(format!(
1247                "Child type mismatch for {}. Expected {} but child data had {}",
1248                self.data_type, expected_type, values_data.data_type
1249            )));
1250        }
1251
1252        values_data.validate()?;
1253        Ok(values_data)
1254    }
1255
1256    /// Validate that the data contained within this [`ArrayData`] is valid
1257    ///
1258    /// 1. Null count is correct
1259    /// 2. All offsets are valid
1260    /// 3. All String data is valid UTF-8
1261    /// 4. All dictionary offsets are valid
1262    ///
1263    /// Internally this calls:
1264    ///
1265    /// * [`Self::validate`]
1266    /// * [`Self::validate_nulls`]
1267    /// * [`Self::validate_values`]
1268    ///
1269    /// Note: this does not recurse into children, for a recursive variant
1270    /// see [`Self::validate_full`]
1271    pub fn validate_data(&self) -> Result<(), ArrowError> {
1272        self.validate()?;
1273
1274        self.validate_nulls()?;
1275        self.validate_values()?;
1276        Ok(())
1277    }
1278
1279    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1280    ///
1281    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1282    /// and all its children recursively
1283    pub fn validate_full(&self) -> Result<(), ArrowError> {
1284        self.validate_data()?;
1285        // validate all children recursively
1286        self.child_data
1287            .iter()
1288            .enumerate()
1289            .try_for_each(|(i, child_data)| {
1290                child_data.validate_full().map_err(|e| {
1291                    ArrowError::InvalidArgumentError(format!(
1292                        "{} child #{} invalid: {}",
1293                        self.data_type, i, e
1294                    ))
1295                })
1296            })?;
1297        Ok(())
1298    }
1299
1300    /// Validates the values stored within this [`ArrayData`] are valid
1301    /// without recursing into child [`ArrayData`]
1302    ///
1303    /// Does not (yet) check
1304    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1305    /// 2. the the null count is correct and that any
1306    /// 3. nullability requirements of its children are correct
1307    ///
1308    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1309    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1310        if let Some(nulls) = &self.nulls {
1311            let actual = nulls.len() - nulls.inner().count_set_bits();
1312            if actual != nulls.null_count() {
1313                return Err(ArrowError::InvalidArgumentError(format!(
1314                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1315                    nulls.null_count(),
1316                    actual
1317                )));
1318            }
1319        }
1320
1321        // In general non-nullable children should not contain nulls, however, for certain
1322        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1323        // space in the child. As such we permit nulls in the children in the corresponding
1324        // positions for such types
1325        match &self.data_type {
1326            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1327                if !f.is_nullable() {
1328                    self.validate_non_nullable(None, &self.child_data[0])?
1329                }
1330            }
1331            DataType::FixedSizeList(field, len) => {
1332                let child = &self.child_data[0];
1333                if !field.is_nullable() {
1334                    match &self.nulls {
1335                        Some(nulls) => {
1336                            let element_len = *len as usize;
1337                            let expanded = nulls.expand(element_len);
1338                            self.validate_non_nullable(Some(&expanded), child)?;
1339                        }
1340                        None => self.validate_non_nullable(None, child)?,
1341                    }
1342                }
1343            }
1344            DataType::Struct(fields) => {
1345                for (field, child) in fields.iter().zip(&self.child_data) {
1346                    if !field.is_nullable() {
1347                        self.validate_non_nullable(self.nulls(), child)?
1348                    }
1349                }
1350            }
1351            _ => {}
1352        }
1353
1354        Ok(())
1355    }
1356
1357    /// Verifies that `child` contains no nulls not present in `mask`
1358    fn validate_non_nullable(
1359        &self,
1360        mask: Option<&NullBuffer>,
1361        child: &ArrayData,
1362    ) -> Result<(), ArrowError> {
1363        let mask = match mask {
1364            Some(mask) => mask,
1365            None => {
1366                return match child.null_count() {
1367                    0 => Ok(()),
1368                    _ => Err(ArrowError::InvalidArgumentError(format!(
1369                        "non-nullable child of type {} contains nulls not present in parent {}",
1370                        child.data_type, self.data_type
1371                    ))),
1372                };
1373            }
1374        };
1375
1376        match child.nulls() {
1377            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1378                "non-nullable child of type {} contains nulls not present in parent",
1379                child.data_type
1380            ))),
1381            _ => Ok(()),
1382        }
1383    }
1384
1385    /// Validates the values stored within this [`ArrayData`] are valid
1386    /// without recursing into child [`ArrayData`]
1387    ///
1388    /// Does not (yet) check
1389    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1390    pub fn validate_values(&self) -> Result<(), ArrowError> {
1391        match &self.data_type {
1392            DataType::Utf8 => self.validate_utf8::<i32>(),
1393            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1394            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1395            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1396            DataType::BinaryView => {
1397                let views = self.typed_buffer::<u128>(0, self.len)?;
1398                validate_binary_view(views, &self.buffers[1..])
1399            }
1400            DataType::Utf8View => {
1401                let views = self.typed_buffer::<u128>(0, self.len)?;
1402                validate_string_view(views, &self.buffers[1..])
1403            }
1404            DataType::List(_) | DataType::Map(_, _) => {
1405                let child = &self.child_data[0];
1406                self.validate_offsets_full::<i32>(child.len)
1407            }
1408            DataType::LargeList(_) => {
1409                let child = &self.child_data[0];
1410                self.validate_offsets_full::<i64>(child.len)
1411            }
1412            DataType::Union(_, _) => {
1413                // Validate Union Array as part of implementing new Union semantics
1414                // See comments in `ArrayData::validate()`
1415                // https://github.com/apache/arrow-rs/issues/85
1416                //
1417                // TODO file follow on ticket for full union validation
1418                Ok(())
1419            }
1420            DataType::Dictionary(key_type, _value_type) => {
1421                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1422                let max_value = dictionary_length - 1;
1423                match key_type.as_ref() {
1424                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1425                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1426                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1427                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1428                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1429                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1430                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1431                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1432                    _ => unreachable!(),
1433                }
1434            }
1435            DataType::RunEndEncoded(run_ends, _values) => {
1436                let run_ends_data = self.child_data()[0].clone();
1437                match run_ends.data_type() {
1438                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1439                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1440                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1441                    _ => unreachable!(),
1442                }
1443            }
1444            _ => {
1445                // No extra validation check required for other types
1446                Ok(())
1447            }
1448        }
1449    }
1450
1451    /// Calls the `validate(item_index, range)` function for each of
1452    /// the ranges specified in the arrow offsets buffer of type
1453    /// `T`. Also validates that each offset is smaller than
1454    /// `offset_limit`
1455    ///
1456    /// For an empty array, the offsets buffer can either be empty
1457    /// or contain a single `0`.
1458    ///
1459    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1460    /// function would call `validate([1,2])`, and `validate([2,4])`
1461    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1462    where
1463        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1464        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1465    {
1466        self.typed_offsets::<T>()?
1467            .iter()
1468            .enumerate()
1469            .map(|(i, x)| {
1470                // check if the offset can be converted to usize
1471                let r = x.to_usize().ok_or_else(|| {
1472                    ArrowError::InvalidArgumentError(format!(
1473                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1474                    );
1475                // check if the offset exceeds the limit
1476                match r {
1477                    Ok(n) if n <= offset_limit => Ok((i, n)),
1478                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1479                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1480                    ),
1481                    Err(e) => Err(e),
1482                }
1483            })
1484            .scan(0_usize, |start, end| {
1485                // check offsets are monotonically increasing
1486                match end {
1487                    Ok((i, end)) if *start <= end => {
1488                        let range = Some(Ok((i, *start..end)));
1489                        *start = end;
1490                        range
1491                    }
1492                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1493                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1494                        i - 1, start, end))
1495                    )),
1496                    Err(err) => Some(Err(err)),
1497                }
1498            })
1499            .skip(1) // the first element is meaningless
1500            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1501                let (item_index, range) = res?;
1502                validate(item_index-1, range)
1503            })
1504    }
1505
1506    /// Ensures that all strings formed by the offsets in `buffers[0]`
1507    /// into `buffers[1]` are valid utf8 sequences
1508    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1509    where
1510        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1511    {
1512        let values_buffer = &self.buffers[1].as_slice();
1513        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1514            // Validate Offsets are correct
1515            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1516                if !values_str.is_char_boundary(range.start)
1517                    || !values_str.is_char_boundary(range.end)
1518                {
1519                    return Err(ArrowError::InvalidArgumentError(format!(
1520                        "incomplete utf-8 byte sequence from index {string_index}"
1521                    )));
1522                }
1523                Ok(())
1524            })
1525        } else {
1526            // find specific offset that failed utf8 validation
1527            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1528                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1529                    ArrowError::InvalidArgumentError(format!(
1530                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1531                    ))
1532                })?;
1533                Ok(())
1534            })
1535        }
1536    }
1537
1538    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1539    /// between `0` and `offset_limit`
1540    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1541    where
1542        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1543    {
1544        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1545            // No validation applied to each value, but the iteration
1546            // itself applies bounds checking to each range
1547            Ok(())
1548        })
1549    }
1550
1551    /// Validates that each value in self.buffers (typed as T)
1552    /// is within the range [0, max_value], inclusive
1553    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1554    where
1555        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1556    {
1557        let required_len = self.len + self.offset;
1558        let buffer = &self.buffers[0];
1559
1560        // This should have been checked as part of `validate()` prior
1561        // to calling `validate_full()` but double check to be sure
1562        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1563
1564        // Justification: buffer size was validated above
1565        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1566
1567        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1568            // Do not check the value is null (value can be arbitrary)
1569            if self.is_null(i) {
1570                return Ok(());
1571            }
1572            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1573                ArrowError::InvalidArgumentError(format!(
1574                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1575                ))
1576            })?;
1577
1578            if dict_index < 0 || dict_index > max_value {
1579                return Err(ArrowError::InvalidArgumentError(format!(
1580                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1581                )));
1582            }
1583            Ok(())
1584        })
1585    }
1586
1587    /// Validates that each value in run_ends array is positive and strictly increasing.
1588    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1589    where
1590        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1591    {
1592        let values = self.typed_buffer::<T>(0, self.len)?;
1593        let mut prev_value: i64 = 0_i64;
1594        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1595            let value: i64 = inp_value.try_into().map_err(|_| {
1596                ArrowError::InvalidArgumentError(format!(
1597                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1598                ))
1599            })?;
1600            if value <= 0_i64 {
1601                return Err(ArrowError::InvalidArgumentError(format!(
1602                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1603                )));
1604            }
1605            if ix > 0 && value <= prev_value {
1606                return Err(ArrowError::InvalidArgumentError(format!(
1607                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1608                )));
1609            }
1610
1611            prev_value = value;
1612            Ok(())
1613        })?;
1614
1615        if prev_value.as_usize() < (self.offset + self.len) {
1616            return Err(ArrowError::InvalidArgumentError(format!(
1617                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1618                self.offset + self.len
1619            )));
1620        }
1621        Ok(())
1622    }
1623
1624    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1625    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1626    /// return false when the arrays are logically equal
1627    pub fn ptr_eq(&self, other: &Self) -> bool {
1628        if self.offset != other.offset
1629            || self.len != other.len
1630            || self.data_type != other.data_type
1631            || self.buffers.len() != other.buffers.len()
1632            || self.child_data.len() != other.child_data.len()
1633        {
1634            return false;
1635        }
1636
1637        match (&self.nulls, &other.nulls) {
1638            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1639            (Some(_), None) | (None, Some(_)) => return false,
1640            _ => {}
1641        };
1642
1643        if !self
1644            .buffers
1645            .iter()
1646            .zip(other.buffers.iter())
1647            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1648        {
1649            return false;
1650        }
1651
1652        self.child_data
1653            .iter()
1654            .zip(other.child_data.iter())
1655            .all(|(a, b)| a.ptr_eq(b))
1656    }
1657
1658    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1659    pub fn into_builder(self) -> ArrayDataBuilder {
1660        self.into()
1661    }
1662
1663    /// Claim memory used by this ArrayData in the provided memory pool.
1664    ///
1665    /// This claims memory for:
1666    /// - All buffers in self.buffers
1667    /// - All child ArrayData recursively
1668    /// - The null buffer if present
1669    #[cfg(feature = "pool")]
1670    pub fn claim(&self, pool: &dyn arrow_buffer::MemoryPool) {
1671        // Claim all data buffers
1672        for buffer in &self.buffers {
1673            buffer.claim(pool);
1674        }
1675
1676        // Claim null buffer if present
1677        if let Some(nulls) = &self.nulls {
1678            nulls.claim(pool);
1679        }
1680
1681        // Recursively claim child data
1682        for child in &self.child_data {
1683            child.claim(pool);
1684        }
1685    }
1686}
1687
1688/// Return the expected [`DataTypeLayout`] Arrays of this data
1689/// type are expected to have
1690pub fn layout(data_type: &DataType) -> DataTypeLayout {
1691    // based on C/C++ implementation in
1692    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1693    use arrow_schema::IntervalUnit::*;
1694
1695    match data_type {
1696        DataType::Null => DataTypeLayout {
1697            buffers: vec![],
1698            can_contain_null_mask: false,
1699            variadic: false,
1700        },
1701        DataType::Boolean => DataTypeLayout {
1702            buffers: vec![BufferSpec::BitMap],
1703            can_contain_null_mask: true,
1704            variadic: false,
1705        },
1706        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1707        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1708        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1709        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1710        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1711        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1712        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1713        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1714        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1715        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1716        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1717        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1718        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1719        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1720        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1721        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1722        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1723        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1724        DataType::Interval(MonthDayNano) => {
1725            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1726        }
1727        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1728        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1729        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1730        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1731        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1732        DataType::FixedSizeBinary(size) => {
1733            let spec = BufferSpec::FixedWidth {
1734                byte_width: (*size).try_into().unwrap(),
1735                alignment: mem::align_of::<u8>(),
1736            };
1737            DataTypeLayout {
1738                buffers: vec![spec],
1739                can_contain_null_mask: true,
1740                variadic: false,
1741            }
1742        }
1743        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1744        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1745        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1746        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1747        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1748        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1749        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1750        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1751        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1752        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1753        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1754        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1755        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1756        DataType::Union(_, mode) => {
1757            let type_ids = BufferSpec::FixedWidth {
1758                byte_width: mem::size_of::<i8>(),
1759                alignment: mem::align_of::<i8>(),
1760            };
1761
1762            DataTypeLayout {
1763                buffers: match mode {
1764                    UnionMode::Sparse => {
1765                        vec![type_ids]
1766                    }
1767                    UnionMode::Dense => {
1768                        vec![
1769                            type_ids,
1770                            BufferSpec::FixedWidth {
1771                                byte_width: mem::size_of::<i32>(),
1772                                alignment: mem::align_of::<i32>(),
1773                            },
1774                        ]
1775                    }
1776                },
1777                can_contain_null_mask: false,
1778                variadic: false,
1779            }
1780        }
1781        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1782    }
1783}
1784
1785/// Layout specification for a data type
1786#[derive(Debug, PartialEq, Eq)]
1787// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
1788pub struct DataTypeLayout {
1789    /// A vector of buffer layout specifications, one for each expected buffer
1790    pub buffers: Vec<BufferSpec>,
1791
1792    /// Can contain a null bitmask
1793    pub can_contain_null_mask: bool,
1794
1795    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
1796    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
1797    /// buffers.len(). Buffers that exceed the lower bound are legal.
1798    pub variadic: bool,
1799}
1800
1801impl DataTypeLayout {
1802    /// Describes a basic numeric array where each element has type `T`
1803    pub fn new_fixed_width<T>() -> Self {
1804        Self {
1805            buffers: vec![BufferSpec::FixedWidth {
1806                byte_width: mem::size_of::<T>(),
1807                alignment: mem::align_of::<T>(),
1808            }],
1809            can_contain_null_mask: true,
1810            variadic: false,
1811        }
1812    }
1813
1814    /// Describes arrays which have no data of their own
1815    /// but may still have a Null Bitmap (e.g. FixedSizeList)
1816    pub fn new_nullable_empty() -> Self {
1817        Self {
1818            buffers: vec![],
1819            can_contain_null_mask: true,
1820            variadic: false,
1821        }
1822    }
1823
1824    /// Describes arrays which have no data of their own
1825    /// (e.g. RunEndEncoded).
1826    pub fn new_empty() -> Self {
1827        Self {
1828            buffers: vec![],
1829            can_contain_null_mask: false,
1830            variadic: false,
1831        }
1832    }
1833
1834    /// Describes a basic numeric array where each element has a fixed
1835    /// with offset buffer of type `T`, followed by a
1836    /// variable width data buffer
1837    pub fn new_binary<T>() -> Self {
1838        Self {
1839            buffers: vec![
1840                // offsets
1841                BufferSpec::FixedWidth {
1842                    byte_width: mem::size_of::<T>(),
1843                    alignment: mem::align_of::<T>(),
1844                },
1845                // values
1846                BufferSpec::VariableWidth,
1847            ],
1848            can_contain_null_mask: true,
1849            variadic: false,
1850        }
1851    }
1852
1853    /// Describes a view type
1854    pub fn new_view() -> Self {
1855        Self {
1856            buffers: vec![BufferSpec::FixedWidth {
1857                byte_width: mem::size_of::<u128>(),
1858                alignment: mem::align_of::<u128>(),
1859            }],
1860            can_contain_null_mask: true,
1861            variadic: true,
1862        }
1863    }
1864
1865    /// Describes a list view type
1866    pub fn new_list_view<T>() -> Self {
1867        Self {
1868            buffers: vec![
1869                BufferSpec::FixedWidth {
1870                    byte_width: mem::size_of::<T>(),
1871                    alignment: mem::align_of::<T>(),
1872                },
1873                BufferSpec::FixedWidth {
1874                    byte_width: mem::size_of::<T>(),
1875                    alignment: mem::align_of::<T>(),
1876                },
1877            ],
1878            can_contain_null_mask: true,
1879            variadic: false,
1880        }
1881    }
1882}
1883
1884/// Layout specification for a single data type buffer
1885#[derive(Debug, PartialEq, Eq)]
1886pub enum BufferSpec {
1887    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
1888    ///
1889    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
1890    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
1891    ///
1892    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
1893    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
1894    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
1895    ///
1896    /// Note that these alignment requirements will vary between architectures
1897    FixedWidth {
1898        /// The width of each element in bytes
1899        byte_width: usize,
1900        /// The alignment required by Rust for an array of the corresponding primitive
1901        alignment: usize,
1902    },
1903    /// Variable width, such as string data for utf8 data
1904    VariableWidth,
1905    /// Buffer holds a bitmap.
1906    ///
1907    /// Note: Unlike the C++ implementation, the null/validity buffer
1908    /// is handled specially rather than as another of the buffers in
1909    /// the spec, so this variant is only used for the Boolean type.
1910    BitMap,
1911    /// Buffer is always null. Unused currently in Rust implementation,
1912    /// (used in C++ for Union type)
1913    #[allow(dead_code)]
1914    AlwaysNull,
1915}
1916
1917impl PartialEq for ArrayData {
1918    fn eq(&self, other: &Self) -> bool {
1919        equal::equal(self, other)
1920    }
1921}
1922
1923/// A boolean flag that cannot be mutated outside of unsafe code.
1924///
1925/// Defaults to a value of false.
1926///
1927/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
1928///
1929/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
1930///
1931/// # Example
1932/// ```rust
1933/// use arrow_data::UnsafeFlag;
1934/// assert!(!UnsafeFlag::default().get()); // default is false
1935/// let mut flag = UnsafeFlag::new();
1936/// assert!(!flag.get()); // defaults to false
1937/// // can only set it to true in unsafe code
1938/// unsafe { flag.set(true) };
1939/// assert!(flag.get()); // now true
1940/// ```
1941#[derive(Debug, Clone)]
1942#[doc(hidden)]
1943pub struct UnsafeFlag(bool);
1944
1945impl UnsafeFlag {
1946    /// Creates a new `UnsafeFlag` with the value set to `false`.
1947    ///
1948    /// See examples on [`Self::new`]
1949    #[inline]
1950    pub const fn new() -> Self {
1951        Self(false)
1952    }
1953
1954    /// Sets the value of the flag to the given value
1955    ///
1956    /// Note this can purposely only be done in `unsafe` code
1957    ///
1958    /// # Safety
1959    ///
1960    /// If set, the flag will be set to the given value. There is nothing
1961    /// immediately unsafe about doing so, however, the flag can be used to
1962    /// subsequently bypass safety checks in the [`ArrayDataBuilder`].
1963    #[inline]
1964    pub unsafe fn set(&mut self, val: bool) {
1965        self.0 = val;
1966    }
1967
1968    /// Returns the value of the flag
1969    #[inline]
1970    pub fn get(&self) -> bool {
1971        self.0
1972    }
1973}
1974
1975// Manual impl to make it clear you can not construct unsafe with true
1976impl Default for UnsafeFlag {
1977    fn default() -> Self {
1978        Self::new()
1979    }
1980}
1981
1982/// Builder for [`ArrayData`] type
1983#[derive(Debug)]
1984pub struct ArrayDataBuilder {
1985    data_type: DataType,
1986    len: usize,
1987    null_count: Option<usize>,
1988    null_bit_buffer: Option<Buffer>,
1989    nulls: Option<NullBuffer>,
1990    offset: usize,
1991    buffers: Vec<Buffer>,
1992    child_data: Vec<ArrayData>,
1993    /// Should buffers be realigned (copying if necessary)?
1994    ///
1995    /// Defaults to false.
1996    align_buffers: bool,
1997    /// Should data validation be skipped for this [`ArrayData`]?
1998    ///
1999    /// Defaults to false.
2000    ///
2001    /// # Safety
2002    ///
2003    /// This flag can only be set to true using `unsafe` APIs. However, once true
2004    /// subsequent calls to `build()` may result in undefined behavior if the data
2005    /// is not valid.
2006    skip_validation: UnsafeFlag,
2007}
2008
2009impl ArrayDataBuilder {
2010    #[inline]
2011    /// Creates a new array data builder
2012    pub const fn new(data_type: DataType) -> Self {
2013        Self {
2014            data_type,
2015            len: 0,
2016            null_count: None,
2017            null_bit_buffer: None,
2018            nulls: None,
2019            offset: 0,
2020            buffers: vec![],
2021            child_data: vec![],
2022            align_buffers: false,
2023            skip_validation: UnsafeFlag::new(),
2024        }
2025    }
2026
2027    /// Creates a new array data builder from an existing one, changing the data type
2028    pub fn data_type(self, data_type: DataType) -> Self {
2029        Self { data_type, ..self }
2030    }
2031
2032    #[inline]
2033    #[allow(clippy::len_without_is_empty)]
2034    /// Sets the length of the [ArrayData]
2035    pub const fn len(mut self, n: usize) -> Self {
2036        self.len = n;
2037        self
2038    }
2039
2040    /// Sets the null buffer of the [ArrayData]
2041    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
2042        self.nulls = nulls;
2043        self.null_count = None;
2044        self.null_bit_buffer = None;
2045        self
2046    }
2047
2048    /// Sets the null count of the [ArrayData]
2049    pub fn null_count(mut self, null_count: usize) -> Self {
2050        self.null_count = Some(null_count);
2051        self
2052    }
2053
2054    /// Sets the `null_bit_buffer` of the [ArrayData]
2055    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
2056        self.nulls = None;
2057        self.null_bit_buffer = buf;
2058        self
2059    }
2060
2061    /// Sets the offset of the [ArrayData]
2062    #[inline]
2063    pub const fn offset(mut self, n: usize) -> Self {
2064        self.offset = n;
2065        self
2066    }
2067
2068    /// Sets the buffers of the [ArrayData]
2069    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
2070        self.buffers = v;
2071        self
2072    }
2073
2074    /// Adds a single buffer to the [ArrayData]'s buffers
2075    pub fn add_buffer(mut self, b: Buffer) -> Self {
2076        self.buffers.push(b);
2077        self
2078    }
2079
2080    /// Adds multiple buffers to the [ArrayData]'s buffers
2081    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
2082        self.buffers.extend(bs);
2083        self
2084    }
2085
2086    /// Sets the child data of the [ArrayData]
2087    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
2088        self.child_data = v;
2089        self
2090    }
2091
2092    /// Adds a single child data to the [ArrayData]'s child data
2093    pub fn add_child_data(mut self, r: ArrayData) -> Self {
2094        self.child_data.push(r);
2095        self
2096    }
2097
2098    /// Creates an array data, without any validation
2099    ///
2100    /// Note: This is shorthand for
2101    /// ```rust
2102    /// # #[expect(unsafe_op_in_unsafe_fn)]
2103    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
2104    /// # let _ = unsafe {
2105    /// builder.skip_validation(true).build().unwrap()
2106    /// # };
2107    /// ```
2108    ///
2109    /// # Safety
2110    ///
2111    /// The same caveats as [`ArrayData::new_unchecked`]
2112    /// apply.
2113    pub unsafe fn build_unchecked(self) -> ArrayData {
2114        unsafe { self.skip_validation(true) }.build().unwrap()
2115    }
2116
2117    /// Creates an `ArrayData`, consuming `self`
2118    ///
2119    /// # Safety
2120    ///
2121    /// By default the underlying buffers are checked to ensure they are valid
2122    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
2123    /// to true (by the `unsafe` API) this validation is skipped. If the data is
2124    /// not valid, undefined behavior will result.
2125    pub fn build(self) -> Result<ArrayData, ArrowError> {
2126        let Self {
2127            data_type,
2128            len,
2129            null_count,
2130            null_bit_buffer,
2131            nulls,
2132            offset,
2133            buffers,
2134            child_data,
2135            align_buffers,
2136            skip_validation,
2137        } = self;
2138
2139        let nulls = nulls
2140            .or_else(|| {
2141                let buffer = null_bit_buffer?;
2142                let buffer = BooleanBuffer::new(buffer, offset, len);
2143                Some(match null_count {
2144                    Some(n) => {
2145                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2146                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2147                    }
2148                    None => NullBuffer::new(buffer),
2149                })
2150            })
2151            .filter(|b| b.null_count() != 0);
2152
2153        let mut data = ArrayData {
2154            data_type,
2155            len,
2156            offset,
2157            buffers,
2158            child_data,
2159            nulls,
2160        };
2161
2162        if align_buffers {
2163            data.align_buffers();
2164        }
2165
2166        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2167        if !skip_validation.get() || cfg!(feature = "force_validate") {
2168            data.validate_data()?;
2169        }
2170        Ok(data)
2171    }
2172
2173    /// Creates an array data, validating all inputs, and aligning any buffers
2174    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2175    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2176        self.align_buffers(true).build()
2177    }
2178
2179    /// Ensure that all buffers are aligned, copying data if necessary
2180    ///
2181    /// Rust requires that arrays are aligned to their corresponding primitive,
2182    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2183    ///
2184    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2185    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2186    ///
2187    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2188    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2189    /// when necessary, making it useful when interacting with buffers produced by other systems,
2190    /// e.g. IPC or FFI.
2191    ///
2192    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2193    /// insufficiently aligned buffers.
2194    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2195        self.align_buffers = align_buffers;
2196        self
2197    }
2198
2199    /// Skips validation of the data.
2200    ///
2201    /// If this flag is enabled, `[Self::build`] will skip validation of the
2202    /// data
2203    ///
2204    /// If this flag is not enabled, `[Self::build`] will validate that all
2205    /// buffers are valid and will return an error if any data is invalid.
2206    /// Validation can be expensive.
2207    ///
2208    /// # Safety
2209    ///
2210    /// If validation is skipped, the buffers must form a valid Arrow array,
2211    /// otherwise undefined behavior will result
2212    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2213        unsafe {
2214            self.skip_validation.set(skip_validation);
2215        }
2216        self
2217    }
2218}
2219
2220impl From<ArrayData> for ArrayDataBuilder {
2221    fn from(d: ArrayData) -> Self {
2222        Self {
2223            data_type: d.data_type,
2224            len: d.len,
2225            offset: d.offset,
2226            buffers: d.buffers,
2227            child_data: d.child_data,
2228            nulls: d.nulls,
2229            null_bit_buffer: None,
2230            null_count: None,
2231            align_buffers: false,
2232            skip_validation: UnsafeFlag::new(),
2233        }
2234    }
2235}
2236
2237#[cfg(test)]
2238mod tests {
2239    use super::*;
2240    use arrow_schema::{Field, Fields};
2241
2242    // See arrow/tests/array_data_validation.rs for test of array validation
2243
2244    /// returns a buffer initialized with some constant value for tests
2245    fn make_i32_buffer(n: usize) -> Buffer {
2246        Buffer::from_slice_ref(vec![42i32; n])
2247    }
2248
2249    /// returns a buffer initialized with some constant value for tests
2250    fn make_f32_buffer(n: usize) -> Buffer {
2251        Buffer::from_slice_ref(vec![42f32; n])
2252    }
2253
2254    #[test]
2255    fn test_builder() {
2256        // Buffer needs to be at least 25 long
2257        let v = (0..25).collect::<Vec<i32>>();
2258        let b1 = Buffer::from_slice_ref(&v);
2259        let arr_data = ArrayData::builder(DataType::Int32)
2260            .len(20)
2261            .offset(5)
2262            .add_buffer(b1)
2263            .null_bit_buffer(Some(Buffer::from([
2264                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2265            ])))
2266            .build()
2267            .unwrap();
2268
2269        assert_eq!(20, arr_data.len());
2270        assert_eq!(10, arr_data.null_count());
2271        assert_eq!(5, arr_data.offset());
2272        assert_eq!(1, arr_data.buffers().len());
2273        assert_eq!(
2274            Buffer::from_slice_ref(&v).as_slice(),
2275            arr_data.buffers()[0].as_slice()
2276        );
2277    }
2278
2279    #[test]
2280    fn test_builder_with_child_data() {
2281        let child_arr_data = ArrayData::try_new(
2282            DataType::Int32,
2283            5,
2284            None,
2285            0,
2286            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2287            vec![],
2288        )
2289        .unwrap();
2290
2291        let field = Arc::new(Field::new("x", DataType::Int32, true));
2292        let data_type = DataType::Struct(vec![field].into());
2293
2294        let arr_data = ArrayData::builder(data_type)
2295            .len(5)
2296            .offset(0)
2297            .add_child_data(child_arr_data.clone())
2298            .build()
2299            .unwrap();
2300
2301        assert_eq!(5, arr_data.len());
2302        assert_eq!(1, arr_data.child_data().len());
2303        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2304    }
2305
2306    #[test]
2307    fn test_null_count() {
2308        let mut bit_v: [u8; 2] = [0; 2];
2309        bit_util::set_bit(&mut bit_v, 0);
2310        bit_util::set_bit(&mut bit_v, 3);
2311        bit_util::set_bit(&mut bit_v, 10);
2312        let arr_data = ArrayData::builder(DataType::Int32)
2313            .len(16)
2314            .add_buffer(make_i32_buffer(16))
2315            .null_bit_buffer(Some(Buffer::from(bit_v)))
2316            .build()
2317            .unwrap();
2318        assert_eq!(13, arr_data.null_count());
2319
2320        // Test with offset
2321        let mut bit_v: [u8; 2] = [0; 2];
2322        bit_util::set_bit(&mut bit_v, 0);
2323        bit_util::set_bit(&mut bit_v, 3);
2324        bit_util::set_bit(&mut bit_v, 10);
2325        let arr_data = ArrayData::builder(DataType::Int32)
2326            .len(12)
2327            .offset(2)
2328            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2329            .null_bit_buffer(Some(Buffer::from(bit_v)))
2330            .build()
2331            .unwrap();
2332        assert_eq!(10, arr_data.null_count());
2333    }
2334
2335    #[test]
2336    fn test_null_buffer_ref() {
2337        let mut bit_v: [u8; 2] = [0; 2];
2338        bit_util::set_bit(&mut bit_v, 0);
2339        bit_util::set_bit(&mut bit_v, 3);
2340        bit_util::set_bit(&mut bit_v, 10);
2341        let arr_data = ArrayData::builder(DataType::Int32)
2342            .len(16)
2343            .add_buffer(make_i32_buffer(16))
2344            .null_bit_buffer(Some(Buffer::from(bit_v)))
2345            .build()
2346            .unwrap();
2347        assert!(arr_data.nulls().is_some());
2348        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2349    }
2350
2351    #[test]
2352    fn test_slice() {
2353        let mut bit_v: [u8; 2] = [0; 2];
2354        bit_util::set_bit(&mut bit_v, 0);
2355        bit_util::set_bit(&mut bit_v, 3);
2356        bit_util::set_bit(&mut bit_v, 10);
2357        let data = ArrayData::builder(DataType::Int32)
2358            .len(16)
2359            .add_buffer(make_i32_buffer(16))
2360            .null_bit_buffer(Some(Buffer::from(bit_v)))
2361            .build()
2362            .unwrap();
2363        let new_data = data.slice(1, 15);
2364        assert_eq!(data.len() - 1, new_data.len());
2365        assert_eq!(1, new_data.offset());
2366        assert_eq!(data.null_count(), new_data.null_count());
2367
2368        // slice of a slice (removes one null)
2369        let new_data = new_data.slice(1, 14);
2370        assert_eq!(data.len() - 2, new_data.len());
2371        assert_eq!(2, new_data.offset());
2372        assert_eq!(data.null_count() - 1, new_data.null_count());
2373    }
2374
2375    #[test]
2376    fn test_equality() {
2377        let int_data = ArrayData::builder(DataType::Int32)
2378            .len(1)
2379            .add_buffer(make_i32_buffer(1))
2380            .build()
2381            .unwrap();
2382
2383        let float_data = ArrayData::builder(DataType::Float32)
2384            .len(1)
2385            .add_buffer(make_f32_buffer(1))
2386            .build()
2387            .unwrap();
2388        assert_ne!(int_data, float_data);
2389        assert!(!int_data.ptr_eq(&float_data));
2390        assert!(int_data.ptr_eq(&int_data));
2391
2392        #[allow(clippy::redundant_clone)]
2393        let int_data_clone = int_data.clone();
2394        assert_eq!(int_data, int_data_clone);
2395        assert!(int_data.ptr_eq(&int_data_clone));
2396        assert!(int_data_clone.ptr_eq(&int_data));
2397
2398        let int_data_slice = int_data_clone.slice(1, 0);
2399        assert!(int_data_slice.ptr_eq(&int_data_slice));
2400        assert!(!int_data.ptr_eq(&int_data_slice));
2401        assert!(!int_data_slice.ptr_eq(&int_data));
2402
2403        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2404        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2405        let string_data = ArrayData::try_new(
2406            DataType::Utf8,
2407            3,
2408            Some(Buffer::from_iter(vec![true, false, true])),
2409            0,
2410            vec![offsets_buffer, data_buffer],
2411            vec![],
2412        )
2413        .unwrap();
2414
2415        assert_ne!(float_data, string_data);
2416        assert!(!float_data.ptr_eq(&string_data));
2417
2418        assert!(string_data.ptr_eq(&string_data));
2419
2420        #[allow(clippy::redundant_clone)]
2421        let string_data_cloned = string_data.clone();
2422        assert!(string_data_cloned.ptr_eq(&string_data));
2423        assert!(string_data.ptr_eq(&string_data_cloned));
2424
2425        let string_data_slice = string_data.slice(1, 2);
2426        assert!(string_data_slice.ptr_eq(&string_data_slice));
2427        assert!(!string_data_slice.ptr_eq(&string_data))
2428    }
2429
2430    #[test]
2431    fn test_slice_memory_size() {
2432        let mut bit_v: [u8; 2] = [0; 2];
2433        bit_util::set_bit(&mut bit_v, 0);
2434        bit_util::set_bit(&mut bit_v, 3);
2435        bit_util::set_bit(&mut bit_v, 10);
2436        let data = ArrayData::builder(DataType::Int32)
2437            .len(16)
2438            .add_buffer(make_i32_buffer(16))
2439            .null_bit_buffer(Some(Buffer::from(bit_v)))
2440            .build()
2441            .unwrap();
2442        let new_data = data.slice(1, 14);
2443        assert_eq!(
2444            data.get_slice_memory_size().unwrap() - 8,
2445            new_data.get_slice_memory_size().unwrap()
2446        );
2447        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2448        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2449        let string_data = ArrayData::try_new(
2450            DataType::Utf8,
2451            3,
2452            Some(Buffer::from_iter(vec![true, false, true])),
2453            0,
2454            vec![offsets_buffer, data_buffer],
2455            vec![],
2456        )
2457        .unwrap();
2458        let string_data_slice = string_data.slice(1, 2);
2459        //4 bytes of offset and 2 bytes of data reduced by slicing.
2460        assert_eq!(
2461            string_data.get_slice_memory_size().unwrap() - 6,
2462            string_data_slice.get_slice_memory_size().unwrap()
2463        );
2464    }
2465
2466    #[test]
2467    fn test_count_nulls() {
2468        let buffer = Buffer::from([0b00010110, 0b10011111]);
2469        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2470        let count = count_nulls(Some(&buffer), 0, 16);
2471        assert_eq!(count, 7);
2472
2473        let count = count_nulls(Some(&buffer), 4, 8);
2474        assert_eq!(count, 3);
2475    }
2476
2477    #[test]
2478    fn test_contains_nulls() {
2479        let buffer: Buffer =
2480            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2481        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2482        assert!(contains_nulls(Some(&buffer), 0, 6));
2483        assert!(contains_nulls(Some(&buffer), 0, 3));
2484        assert!(!contains_nulls(Some(&buffer), 3, 2));
2485        assert!(!contains_nulls(Some(&buffer), 0, 0));
2486    }
2487
2488    #[test]
2489    fn test_alignment() {
2490        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2491        let sliced = buffer.slice(1);
2492
2493        let mut data = ArrayData {
2494            data_type: DataType::Int32,
2495            len: 0,
2496            offset: 0,
2497            buffers: vec![buffer],
2498            child_data: vec![],
2499            nulls: None,
2500        };
2501        data.validate_full().unwrap();
2502
2503        // break alignment in data
2504        data.buffers[0] = sliced;
2505        let err = data.validate().unwrap_err();
2506
2507        assert_eq!(
2508            err.to_string(),
2509            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2510        );
2511
2512        data.align_buffers();
2513        data.validate_full().unwrap();
2514    }
2515
2516    #[test]
2517    fn test_alignment_struct() {
2518        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2519        let sliced = buffer.slice(1);
2520
2521        let child_data = ArrayData {
2522            data_type: DataType::Int32,
2523            len: 0,
2524            offset: 0,
2525            buffers: vec![buffer],
2526            child_data: vec![],
2527            nulls: None,
2528        };
2529
2530        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2531        let mut data = ArrayData {
2532            data_type: schema,
2533            len: 0,
2534            offset: 0,
2535            buffers: vec![],
2536            child_data: vec![child_data],
2537            nulls: None,
2538        };
2539        data.validate_full().unwrap();
2540
2541        // break alignment in child data
2542        data.child_data[0].buffers[0] = sliced;
2543        let err = data.validate().unwrap_err();
2544
2545        assert_eq!(
2546            err.to_string(),
2547            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2548        );
2549
2550        data.align_buffers();
2551        data.validate_full().unwrap();
2552    }
2553
2554    #[test]
2555    fn test_null_view_types() {
2556        let array_len = 32;
2557        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2558        assert_eq!(array.len(), array_len);
2559        for i in 0..array.len() {
2560            assert!(array.is_null(i));
2561        }
2562
2563        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2564        assert_eq!(array.len(), array_len);
2565        for i in 0..array.len() {
2566            assert!(array.is_null(i));
2567        }
2568
2569        let array = ArrayData::new_null(
2570            &DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2571            array_len,
2572        );
2573        assert_eq!(array.len(), array_len);
2574        for i in 0..array.len() {
2575            assert!(array.is_null(i));
2576        }
2577
2578        let array = ArrayData::new_null(
2579            &DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2580            array_len,
2581        );
2582        assert_eq!(array.len(), array_len);
2583        for i in 0..array.len() {
2584            assert!(array.is_null(i));
2585        }
2586    }
2587}