arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
//! the attributes and operations common to all Arrow arrays.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35    null_bit_buffer: Option<&NullBuffer>,
36    offset: usize,
37    len: usize,
38) -> bool {
39    match null_bit_buffer {
40        Some(buffer) => {
41            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42                Some((start, end)) => start != 0 || end != len,
43                None => len != 0, // No non-null values
44            }
45        }
46        None => false, // No null buffer
47    }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52    null_bit_buffer: Option<&NullBuffer>,
53    offset: usize,
54    len: usize,
55) -> usize {
56    if let Some(buf) = null_bit_buffer {
57        let buffer = buf.buffer();
58        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59    } else {
60        0
61    }
62}
63
64/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67    let empty_buffer = MutableBuffer::new(0);
68    match data_type {
69        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70        DataType::Boolean => {
71            let bytes = bit_util::ceil(capacity, 8);
72            let buffer = MutableBuffer::new(bytes);
73            [buffer, empty_buffer]
74        }
75        DataType::UInt8
76        | DataType::UInt16
77        | DataType::UInt32
78        | DataType::UInt64
79        | DataType::Int8
80        | DataType::Int16
81        | DataType::Int32
82        | DataType::Int64
83        | DataType::Float16
84        | DataType::Float32
85        | DataType::Float64
86        | DataType::Decimal32(_, _)
87        | DataType::Decimal64(_, _)
88        | DataType::Decimal128(_, _)
89        | DataType::Decimal256(_, _)
90        | DataType::Date32
91        | DataType::Time32(_)
92        | DataType::Date64
93        | DataType::Time64(_)
94        | DataType::Duration(_)
95        | DataType::Timestamp(_, _)
96        | DataType::Interval(_) => [
97            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98            empty_buffer,
99        ],
100        DataType::Utf8 | DataType::Binary => {
101            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102            // safety: `unsafe` code assumes that this buffer is initialized with one element
103            buffer.push(0i32);
104            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105        }
106        DataType::LargeUtf8 | DataType::LargeBinary => {
107            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108            // safety: `unsafe` code assumes that this buffer is initialized with one element
109            buffer.push(0i64);
110            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111        }
112        DataType::BinaryView | DataType::Utf8View => [
113            MutableBuffer::new(capacity * mem::size_of::<u128>()),
114            empty_buffer,
115        ],
116        DataType::List(_) | DataType::Map(_, _) => {
117            // offset buffer always starts with a zero
118            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119            buffer.push(0i32);
120            [buffer, empty_buffer]
121        }
122        DataType::ListView(_) => [
123            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124            MutableBuffer::new(capacity * mem::size_of::<i32>()),
125        ],
126        DataType::LargeList(_) => {
127            // offset buffer always starts with a zero
128            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129            buffer.push(0i64);
130            [buffer, empty_buffer]
131        }
132        DataType::LargeListView(_) => [
133            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134            MutableBuffer::new(capacity * mem::size_of::<i64>()),
135        ],
136        DataType::FixedSizeBinary(size) => {
137            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138        }
139        DataType::Dictionary(k, _) => [
140            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141            empty_buffer,
142        ],
143        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144            [empty_buffer, MutableBuffer::new(0)]
145        }
146        DataType::Union(_, mode) => {
147            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148            match mode {
149                UnionMode::Sparse => [type_ids, empty_buffer],
150                UnionMode::Dense => {
151                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152                    [type_ids, offsets]
153                }
154            }
155        }
156    }
157}
158
/// A generic representation of Arrow array data which encapsulates common attributes
/// and operations for Arrow array.
///
/// Specific operations for different arrays types (e.g., primitive, list, struct)
/// are implemented in `Array`.
///
/// # Memory Layout
///
/// `ArrayData` has references to one or more underlying data buffers
/// and optional child ArrayData, depending on type as illustrated
/// below. Bitmaps are not shown for simplicity but they are stored
/// similarly to the buffers.
///
/// ```text
///                        offset
///                       points to
/// ┌───────────────────┐ start of  ┌───────┐       Different
/// │                   │   data    │       │     ArrayData may
/// │ArrayData {        │           │....   │     also refer to
/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
/// │  buffers: [       │           │5882   │◀─
/// │    ...            │  │        │4323   │
/// │  ]                │   ─ ─ ─ ─▶│4859   │
/// │  child_data: [    │           │....   │
/// │    ...            │           │       │
/// │  ]                │           └───────┘
/// │}                  │
/// │                   │            Shared Buffer uses
/// │               │   │            bytes::Bytes to hold
/// └───────────────────┘            actual data values
///           ┌ ─ ─ ┘
///
///           ▼
/// ┌───────────────────┐
/// │ArrayData {        │
/// │  ...              │
/// │}                  │
/// │                   │
/// └───────────────────┘
///
/// Child ArrayData may also have its own buffers and children
/// ```
#[derive(Debug, Clone)]
pub struct ArrayData {
    /// The data type
    data_type: DataType,

    /// The number of elements
    len: usize,

    /// The offset in number of items (not bytes).
    ///
    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
    /// does NOT apply to [`Self::nulls`].
    offset: usize,

    /// The buffers that store the actual data for this array, as defined
    /// in the [Arrow Spec].
    ///
    /// Depending on the array types, [`Self::buffers`] can hold different
    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
    /// positions.
    ///
    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
    /// we need to satisfy `len`.
    ///
    /// [Arrow Spec]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout
    buffers: Vec<Buffer>,

    /// The child(ren) of this array.
    ///
    /// Only non-empty for nested types, such as `ListArray` and
    /// `StructArray`.
    ///
    /// The first logical element in each child element begins at `offset`.
    ///
    /// If the child element also has an offset then these offsets are
    /// cumulative.
    child_data: Vec<ArrayData>,

    /// The null bitmap.
    ///
    /// `None` indicates all values are non-null in this array.
    ///
    /// [`Self::offset`] does not apply to the null bitmap. While the
    /// BooleanBuffer may be sliced (have its own offset) internally, this
    /// `NullBuffer` always represents exactly `len` elements.
    nulls: Option<NullBuffer>,
}
252
/// A thread-safe, shared reference to the Arrow array data, i.e. an
/// [`ArrayData`] held behind an [`Arc`].
pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257    /// Create a new ArrayData instance;
258    ///
259    /// If `null_count` is not specified, the number of nulls in
260    /// null_bit_buffer is calculated.
261    ///
262    /// If the number of nulls is 0 then the null_bit_buffer
263    /// is set to `None`.
264    ///
265    /// # Safety
266    ///
267    /// The input values *must* form a valid Arrow array for
268    /// `data_type`, or undefined behavior can result.
269    ///
270    /// Note: This is a low level API and most users of the arrow
271    /// crate should create arrays using the methods in the `array`
272    /// module.
273    pub unsafe fn new_unchecked(
274        data_type: DataType,
275        len: usize,
276        null_count: Option<usize>,
277        null_bit_buffer: Option<Buffer>,
278        offset: usize,
279        buffers: Vec<Buffer>,
280        child_data: Vec<ArrayData>,
281    ) -> Self {
282        let mut skip_validation = UnsafeFlag::new();
283        // SAFETY: caller responsible for ensuring data is valid
284        unsafe { skip_validation.set(true) };
285
286        ArrayDataBuilder {
287            data_type,
288            len,
289            null_count,
290            null_bit_buffer,
291            nulls: None,
292            offset,
293            buffers,
294            child_data,
295            align_buffers: false,
296            skip_validation,
297        }
298        .build()
299        .unwrap()
300    }
301
302    /// Create a new ArrayData, validating that the provided buffers form a valid
303    /// Arrow array of the specified data type.
304    ///
305    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
306    /// is set to `None`.
307    ///
308    /// Internally this calls through to [`Self::validate_data`]
309    ///
310    /// Note: This is a low level API and most users of the arrow crate should create
311    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
312    /// or [`ArrayDataBuilder`].
313    ///
314    /// See also [`Self::into_parts`] to recover the fields
315    pub fn try_new(
316        data_type: DataType,
317        len: usize,
318        null_bit_buffer: Option<Buffer>,
319        offset: usize,
320        buffers: Vec<Buffer>,
321        child_data: Vec<ArrayData>,
322    ) -> Result<Self, ArrowError> {
323        // we must check the length of `null_bit_buffer` first
324        // because we use this buffer to calculate `null_count`
325        // in `Self::new_unchecked`.
326        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
327            let needed_len = bit_util::ceil(len + offset, 8);
328            if null_bit_buffer.len() < needed_len {
329                return Err(ArrowError::InvalidArgumentError(format!(
330                    "null_bit_buffer size too small. got {} needed {}",
331                    null_bit_buffer.len(),
332                    needed_len
333                )));
334            }
335        }
336        // Safety justification: `validate_full` is called below
337        let new_self = unsafe {
338            Self::new_unchecked(
339                data_type,
340                len,
341                None,
342                null_bit_buffer,
343                offset,
344                buffers,
345                child_data,
346            )
347        };
348
349        // As the data is not trusted, do a full validation of its contents
350        // We don't need to validate children as we can assume that the
351        // [`ArrayData`] in `child_data` have already been validated through
352        // a call to `ArrayData::try_new` or created using unsafe
353        new_self.validate_data()?;
354        Ok(new_self)
355    }
356
357    /// Return the constituent parts of this ArrayData
358    ///
359    /// This is the inverse of [`ArrayData::try_new`].
360    ///
361    /// Returns `(data_type, len, nulls, offset, buffers, child_data)`
362    pub fn into_parts(
363        self,
364    ) -> (
365        DataType,
366        usize,
367        Option<NullBuffer>,
368        usize,
369        Vec<Buffer>,
370        Vec<ArrayData>,
371    ) {
372        let Self {
373            data_type,
374            len,
375            nulls,
376            offset,
377            buffers,
378            child_data,
379        } = self;
380
381        (data_type, len, nulls, offset, buffers, child_data)
382    }
383
    /// Returns an [`ArrayDataBuilder`] for constructing an [`ArrayData`]
    /// instance of the given `data_type`.
    ///
    /// Shorthand for [`ArrayDataBuilder::new`].
    #[inline]
    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
        ArrayDataBuilder::new(data_type)
    }
389
    /// Returns a reference to the [`DataType`] of this [`ArrayData`].
    #[inline]
    pub const fn data_type(&self) -> &DataType {
        &self.data_type
    }
395
396    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
397    pub fn buffers(&self) -> &[Buffer] {
398        &self.buffers
399    }
400
401    /// Returns a slice of children [`ArrayData`]. This will be non
402    /// empty for type such as lists and structs.
403    pub fn child_data(&self) -> &[ArrayData] {
404        &self.child_data[..]
405    }
406
407    /// Returns whether the element at index `i` is null
408    #[inline]
409    pub fn is_null(&self, i: usize) -> bool {
410        match &self.nulls {
411            Some(v) => v.is_null(i),
412            None => false,
413        }
414    }
415
    /// Returns a reference to the null buffer of this [`ArrayData`], or `None`
    /// if it has no null buffer.
    ///
    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
    #[inline]
    pub fn nulls(&self) -> Option<&NullBuffer> {
        self.nulls.as_ref()
    }
423
    /// Returns whether the element at index `i` is not null.
    ///
    /// This is the negation of [`Self::is_null`].
    #[inline]
    pub fn is_valid(&self, i: usize) -> bool {
        !self.is_null(i)
    }
429
    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
    #[inline]
    pub const fn len(&self) -> usize {
        self.len
    }
435
    /// Returns whether this [`ArrayData`] is empty, i.e. its length is `0`.
    #[inline]
    pub const fn is_empty(&self) -> bool {
        self.len == 0
    }
441
    /// Returns the offset of this [`ArrayData`], in number of items (not bytes).
    #[inline]
    pub const fn offset(&self) -> usize {
        self.offset
    }
447
448    /// Returns the total number of nulls in this array
449    #[inline]
450    pub fn null_count(&self) -> usize {
451        self.nulls
452            .as_ref()
453            .map(|x| x.null_count())
454            .unwrap_or_default()
455    }
456
457    /// Returns the total number of bytes of memory occupied by the
458    /// buffers owned by this [`ArrayData`] and all of its
459    /// children. (See also diagram on [`ArrayData`]).
460    ///
461    /// Note that this [`ArrayData`] may only refer to a subset of the
462    /// data in the underlying [`Buffer`]s (due to `offset` and
463    /// `length`), but the size returned includes the entire size of
464    /// the buffers.
465    ///
466    /// If multiple [`ArrayData`]s refer to the same underlying
467    /// [`Buffer`]s they will both report the same size.
468    pub fn get_buffer_memory_size(&self) -> usize {
469        let mut size = 0;
470        for buffer in &self.buffers {
471            size += buffer.capacity();
472        }
473        if let Some(bitmap) = &self.nulls {
474            size += bitmap.buffer().capacity()
475        }
476        for child in &self.child_data {
477            size += child.get_buffer_memory_size();
478        }
479        size
480    }
481
482    /// Returns the total number of the bytes of memory occupied by
483    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
484    ///
485    /// This is approximately the number of bytes if a new
486    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
487    /// exactly the data needed.
488    ///
489    /// For example, a [`DataType::Int64`] with `100` elements,
490    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
491    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
492    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
493    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
494    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
495        let mut result: usize = 0;
496        let layout = layout(&self.data_type);
497
498        for spec in layout.buffers.iter() {
499            match spec {
500                BufferSpec::FixedWidth { byte_width, .. } => {
501                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
502                        ArrowError::ComputeError(
503                            "Integer overflow computing buffer size".to_string(),
504                        )
505                    })?;
506                    result += buffer_size;
507                }
508                BufferSpec::VariableWidth => {
509                    let buffer_len = match self.data_type {
510                        DataType::Utf8 | DataType::Binary => {
511                            let offsets = self.typed_offsets::<i32>()?;
512                            (offsets[self.len] - offsets[0]) as usize
513                        }
514                        DataType::LargeUtf8 | DataType::LargeBinary => {
515                            let offsets = self.typed_offsets::<i64>()?;
516                            (offsets[self.len] - offsets[0]) as usize
517                        }
518                        _ => {
519                            return Err(ArrowError::NotYetImplemented(format!(
520                                "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
521                                self.data_type
522                            )));
523                        }
524                    };
525                    result += buffer_len;
526                }
527                BufferSpec::BitMap => {
528                    let buffer_size = bit_util::ceil(self.len, 8);
529                    result += buffer_size;
530                }
531                BufferSpec::AlwaysNull => {
532                    // Nothing to do
533                }
534            }
535        }
536
537        if self.nulls().is_some() {
538            result += bit_util::ceil(self.len, 8);
539        }
540
541        for child in &self.child_data {
542            result += child.get_slice_memory_size()?;
543        }
544        Ok(result)
545    }
546
547    /// Returns the total number of bytes of memory occupied
548    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
549    /// children. (See also diagram on [`ArrayData`]).
550    ///
551    /// Equivalent to:
552    ///  `size_of_val(self)` +
553    ///  [`Self::get_buffer_memory_size`] +
554    ///  `size_of_val(child)` for all children
555    pub fn get_array_memory_size(&self) -> usize {
556        let mut size = mem::size_of_val(self);
557
558        // Calculate rest of the fields top down which contain actual data
559        for buffer in &self.buffers {
560            size += mem::size_of::<Buffer>();
561            size += buffer.capacity();
562        }
563        if let Some(nulls) = &self.nulls {
564            size += nulls.buffer().capacity();
565        }
566        for child in &self.child_data {
567            size += child.get_array_memory_size();
568        }
569
570        size
571    }
572
573    /// Creates a zero-copy slice of itself. This creates a new
574    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
575    /// different offset and len
576    ///
577    /// # Panics
578    ///
579    /// Panics if `offset + length > self.len()`.
580    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
581        assert!((offset + length) <= self.len());
582
583        if let DataType::Struct(_) = self.data_type() {
584            // Slice into children
585            let new_offset = self.offset + offset;
586            ArrayData {
587                data_type: self.data_type().clone(),
588                len: length,
589                offset: new_offset,
590                buffers: self.buffers.clone(),
591                // Slice child data, to propagate offsets down to them
592                child_data: self
593                    .child_data()
594                    .iter()
595                    .map(|data| data.slice(offset, length))
596                    .collect(),
597                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
598            }
599        } else {
600            let mut new_data = self.clone();
601
602            new_data.len = length;
603            new_data.offset = offset + self.offset;
604            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
605
606            new_data
607        }
608    }
609
610    /// Returns the `buffer` as a slice of type `T` starting at self.offset
611    ///
612    /// # Panics
613    /// This function panics if:
614    /// * the buffer is not byte-aligned with type T, or
615    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
616    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
617        &self.buffers()[buffer].typed_data()[self.offset..]
618    }
619
620    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
621    pub fn new_null(data_type: &DataType, len: usize) -> Self {
622        let bit_len = bit_util::ceil(len, 8);
623        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
624
625        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
626            Some(width) => (vec![zeroed(width * len)], vec![], true),
627            None => match data_type {
628                DataType::Null => (vec![], vec![], false),
629                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
630                DataType::Binary | DataType::Utf8 => {
631                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
632                }
633                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
634                DataType::LargeBinary | DataType::LargeUtf8 => {
635                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
636                }
637                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
638                DataType::List(f) | DataType::Map(f, _) => (
639                    vec![zeroed((len + 1) * 4)],
640                    vec![ArrayData::new_empty(f.data_type())],
641                    true,
642                ),
643                DataType::LargeList(f) => (
644                    vec![zeroed((len + 1) * 8)],
645                    vec![ArrayData::new_empty(f.data_type())],
646                    true,
647                ),
648                DataType::ListView(f) => (
649                    vec![zeroed(len * 4), zeroed(len * 4)],
650                    vec![ArrayData::new_empty(f.data_type())],
651                    true,
652                ),
653                DataType::LargeListView(f) => (
654                    vec![zeroed(len * 8), zeroed(len * 8)],
655                    vec![ArrayData::new_empty(f.data_type())],
656                    true,
657                ),
658                DataType::FixedSizeList(f, list_len) => (
659                    vec![],
660                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
661                    true,
662                ),
663                DataType::Struct(fields) => (
664                    vec![],
665                    fields
666                        .iter()
667                        .map(|f| Self::new_null(f.data_type(), len))
668                        .collect(),
669                    true,
670                ),
671                DataType::Dictionary(k, v) => (
672                    vec![zeroed(k.primitive_width().unwrap() * len)],
673                    vec![ArrayData::new_empty(v.as_ref())],
674                    true,
675                ),
676                DataType::Union(f, mode) => {
677                    let (id, _) = f.iter().next().unwrap();
678                    let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
679                    let buffers = match mode {
680                        UnionMode::Sparse => vec![ids],
681                        UnionMode::Dense => {
682                            let end_offset = i32::from_usize(len).unwrap();
683                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
684                        }
685                    };
686
687                    let children = f
688                        .iter()
689                        .enumerate()
690                        .map(|(idx, (_, f))| {
691                            if idx == 0 || *mode == UnionMode::Sparse {
692                                Self::new_null(f.data_type(), len)
693                            } else {
694                                Self::new_empty(f.data_type())
695                            }
696                        })
697                        .collect();
698
699                    (buffers, children, false)
700                }
701                DataType::RunEndEncoded(r, v) => {
702                    let runs = match r.data_type() {
703                        DataType::Int16 => {
704                            let i = i16::from_usize(len).expect("run overflow");
705                            Buffer::from_slice_ref([i])
706                        }
707                        DataType::Int32 => {
708                            let i = i32::from_usize(len).expect("run overflow");
709                            Buffer::from_slice_ref([i])
710                        }
711                        DataType::Int64 => {
712                            let i = i64::from_usize(len).expect("run overflow");
713                            Buffer::from_slice_ref([i])
714                        }
715                        dt => unreachable!("Invalid run ends data type {dt}"),
716                    };
717
718                    let builder = ArrayData::builder(r.data_type().clone())
719                        .len(1)
720                        .buffers(vec![runs]);
721
722                    // SAFETY:
723                    // Valid by construction
724                    let runs = unsafe { builder.build_unchecked() };
725                    (
726                        vec![],
727                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
728                        false,
729                    )
730                }
731                // Handled by Some(width) branch above
732                DataType::Int8
733                | DataType::Int16
734                | DataType::Int32
735                | DataType::Int64
736                | DataType::UInt8
737                | DataType::UInt16
738                | DataType::UInt32
739                | DataType::UInt64
740                | DataType::Float16
741                | DataType::Float32
742                | DataType::Float64
743                | DataType::Timestamp(_, _)
744                | DataType::Date32
745                | DataType::Date64
746                | DataType::Time32(_)
747                | DataType::Time64(_)
748                | DataType::Duration(_)
749                | DataType::Interval(_)
750                | DataType::Decimal32(_, _)
751                | DataType::Decimal64(_, _)
752                | DataType::Decimal128(_, _)
753                | DataType::Decimal256(_, _) => unreachable!("{data_type}"),
754            },
755        };
756
757        let mut builder = ArrayDataBuilder::new(data_type.clone())
758            .len(len)
759            .buffers(buffers)
760            .child_data(child_data);
761
762        if has_nulls {
763            builder = builder.nulls(Some(NullBuffer::new_null(len)))
764        }
765
766        // SAFETY:
767        // Data valid by construction
768        unsafe { builder.build_unchecked() }
769    }
770
    /// Returns a new empty [`ArrayData`] valid for `data_type`.
    ///
    /// Equivalent to calling [`Self::new_null`] with a length of `0`.
    pub fn new_empty(data_type: &DataType) -> Self {
        Self::new_null(data_type, 0)
    }
775
776    /// Verifies that the buffers meet the minimum alignment requirements for the data type
777    ///
778    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
779    ///
780    /// This can be useful for when interacting with data sent over IPC or FFI, that may
781    /// not meet the minimum alignment requirements
782    ///
783    /// This also aligns buffers of children data
784    pub fn align_buffers(&mut self) {
785        let layout = layout(&self.data_type);
786        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
787            if let BufferSpec::FixedWidth { alignment, .. } = spec {
788                if buffer.as_ptr().align_offset(*alignment) != 0 {
789                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
790                }
791            }
792        }
793        // align children data recursively
794        for data in self.child_data.iter_mut() {
795            data.align_buffers()
796        }
797    }
798
799    /// "cheap" validation of an `ArrayData`. Ensures buffers are
800    /// sufficiently sized to store `len` + `offset` total elements of
801    /// `data_type` and performs other inexpensive consistency checks.
802    ///
803    /// This check is "cheap" in the sense that it does not validate the
804    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
805    /// are within the bounds of the values buffer).
806    ///
807    /// See [ArrayData::validate_data] to validate fully the offset content
808    /// and the validity of utf8 data
809    pub fn validate(&self) -> Result<(), ArrowError> {
810        // Need at least this mich space in each buffer
811        let len_plus_offset = self.len + self.offset;
812
813        // Check that the data layout conforms to the spec
814        let layout = layout(&self.data_type);
815
816        if !layout.can_contain_null_mask && self.nulls.is_some() {
817            return Err(ArrowError::InvalidArgumentError(format!(
818                "Arrays of type {:?} cannot contain a null bitmask",
819                self.data_type,
820            )));
821        }
822
823        // Check data buffers length for view types and other types
824        if self.buffers.len() < layout.buffers.len()
825            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
826        {
827            return Err(ArrowError::InvalidArgumentError(format!(
828                "Expected {} buffers in array of type {:?}, got {}",
829                layout.buffers.len(),
830                self.data_type,
831                self.buffers.len(),
832            )));
833        }
834
835        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
836            match spec {
837                BufferSpec::FixedWidth {
838                    byte_width,
839                    alignment,
840                } => {
841                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
842
843                    if buffer.len() < min_buffer_size {
844                        return Err(ArrowError::InvalidArgumentError(format!(
845                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
846                            min_buffer_size,
847                            i,
848                            self.data_type,
849                            buffer.len()
850                        )));
851                    }
852
853                    let align_offset = buffer.as_ptr().align_offset(*alignment);
854                    if align_offset != 0 {
855                        return Err(ArrowError::InvalidArgumentError(format!(
856                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
857                            self.data_type,
858                            align_offset.min(alignment - align_offset)
859                        )));
860                    }
861                }
862                BufferSpec::VariableWidth => {
863                    // not cheap to validate (need to look at the
864                    // data). Partially checked in validate_offsets
865                    // called below. Can check with `validate_full`
866                }
867                BufferSpec::BitMap => {
868                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
869                    if buffer.len() < min_buffer_size {
870                        return Err(ArrowError::InvalidArgumentError(format!(
871                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
872                            min_buffer_size,
873                            i,
874                            self.data_type,
875                            buffer.len()
876                        )));
877                    }
878                }
879                BufferSpec::AlwaysNull => {
880                    // Nothing to validate
881                }
882            }
883        }
884
885        // check null bit buffer size
886        if let Some(nulls) = self.nulls() {
887            if nulls.null_count() > self.len {
888                return Err(ArrowError::InvalidArgumentError(format!(
889                    "null_count {} for an array exceeds length of {} elements",
890                    nulls.null_count(),
891                    self.len
892                )));
893            }
894
895            let actual_len = nulls.validity().len();
896            let needed_len = bit_util::ceil(len_plus_offset, 8);
897            if actual_len < needed_len {
898                return Err(ArrowError::InvalidArgumentError(format!(
899                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
900                )));
901            }
902
903            if nulls.len() != self.len {
904                return Err(ArrowError::InvalidArgumentError(format!(
905                    "null buffer incorrect size. got {} expected {}",
906                    nulls.len(),
907                    self.len
908                )));
909            }
910        }
911
912        self.validate_child_data()?;
913
914        // Additional Type specific checks
915        match &self.data_type {
916            DataType::Utf8 | DataType::Binary => {
917                self.validate_offsets::<i32>(self.buffers[1].len())?;
918            }
919            DataType::LargeUtf8 | DataType::LargeBinary => {
920                self.validate_offsets::<i64>(self.buffers[1].len())?;
921            }
922            DataType::Dictionary(key_type, _value_type) => {
923                // At the moment, constructing a DictionaryArray will also check this
924                if !DataType::is_dictionary_key_type(key_type) {
925                    return Err(ArrowError::InvalidArgumentError(format!(
926                        "Dictionary key type must be integer, but was {key_type}"
927                    )));
928                }
929            }
930            DataType::RunEndEncoded(run_ends_type, _) => {
931                if run_ends_type.is_nullable() {
932                    return Err(ArrowError::InvalidArgumentError(
933                        "The nullable should be set to false for the field defining run_ends array.".to_string()
934                    ));
935                }
936                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
937                    return Err(ArrowError::InvalidArgumentError(format!(
938                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
939                        run_ends_type.data_type()
940                    )));
941                }
942            }
943            _ => {}
944        };
945
946        Ok(())
947    }
948
949    /// Returns a reference to the data in `buffer` as a typed slice
950    /// (typically `&[i32]` or `&[i64]`) after validating. The
951    /// returned slice is guaranteed to have at least `self.len + 1`
952    /// entries.
953    ///
954    /// For an empty array, the `buffer` can also be empty.
955    fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
956        // An empty list-like array can have 0 offsets
957        if self.len == 0 && self.buffers[0].is_empty() {
958            return Ok(&[]);
959        }
960
961        self.typed_buffer(0, self.len + 1)
962    }
963
964    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
965    fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
966        &self,
967        idx: usize,
968        len: usize,
969    ) -> Result<&[T], ArrowError> {
970        let buffer = &self.buffers[idx];
971
972        let required_len = (len + self.offset) * mem::size_of::<T>();
973
974        if buffer.len() < required_len {
975            return Err(ArrowError::InvalidArgumentError(format!(
976                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
977                idx,
978                self.data_type,
979                required_len,
980                buffer.len()
981            )));
982        }
983
984        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
985    }
986
987    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
988    /// offsets (of type T) into some other buffer of `values_length` bytes long
989    fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
990        &self,
991        values_length: usize,
992    ) -> Result<(), ArrowError> {
993        // Justification: buffer size was validated above
994        let offsets = self.typed_offsets::<T>()?;
995        if offsets.is_empty() {
996            return Ok(());
997        }
998
999        let first_offset = offsets[0].to_usize().ok_or_else(|| {
1000            ArrowError::InvalidArgumentError(format!(
1001                "Error converting offset[0] ({}) to usize for {}",
1002                offsets[0], self.data_type
1003            ))
1004        })?;
1005
1006        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
1007            ArrowError::InvalidArgumentError(format!(
1008                "Error converting offset[{}] ({}) to usize for {}",
1009                self.len, offsets[self.len], self.data_type
1010            ))
1011        })?;
1012
1013        if first_offset > values_length {
1014            return Err(ArrowError::InvalidArgumentError(format!(
1015                "First offset {} of {} is larger than values length {}",
1016                first_offset, self.data_type, values_length,
1017            )));
1018        }
1019
1020        if last_offset > values_length {
1021            return Err(ArrowError::InvalidArgumentError(format!(
1022                "Last offset {} of {} is larger than values length {}",
1023                last_offset, self.data_type, values_length,
1024            )));
1025        }
1026
1027        if first_offset > last_offset {
1028            return Err(ArrowError::InvalidArgumentError(format!(
1029                "First offset {} in {} is smaller than last offset {}",
1030                first_offset, self.data_type, last_offset,
1031            )));
1032        }
1033
1034        Ok(())
1035    }
1036
1037    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
1038    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
1039    fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
1040        &self,
1041        values_length: usize,
1042    ) -> Result<(), ArrowError> {
1043        let offsets: &[T] = self.typed_buffer(0, self.len)?;
1044        let sizes: &[T] = self.typed_buffer(1, self.len)?;
1045        if offsets.len() != sizes.len() {
1046            return Err(ArrowError::ComputeError(format!(
1047                "ListView offsets len {} does not match sizes len {}",
1048                offsets.len(),
1049                sizes.len()
1050            )));
1051        }
1052
1053        for i in 0..sizes.len() {
1054            let size = sizes[i].to_usize().ok_or_else(|| {
1055                ArrowError::InvalidArgumentError(format!(
1056                    "Error converting size[{}] ({}) to usize for {}",
1057                    i, sizes[i], self.data_type
1058                ))
1059            })?;
1060            let offset = offsets[i].to_usize().ok_or_else(|| {
1061                ArrowError::InvalidArgumentError(format!(
1062                    "Error converting offset[{}] ({}) to usize for {}",
1063                    i, offsets[i], self.data_type
1064                ))
1065            })?;
1066            if size
1067                .checked_add(offset)
1068                .expect("Offset and size have exceeded the usize boundary")
1069                > values_length
1070            {
1071                return Err(ArrowError::InvalidArgumentError(format!(
1072                    "Size {} at index {} is larger than the remaining values for {}",
1073                    size, i, self.data_type
1074                )));
1075            }
1076        }
1077        Ok(())
1078    }
1079
1080    /// Validates the layout of `child_data` ArrayData structures
1081    fn validate_child_data(&self) -> Result<(), ArrowError> {
1082        match &self.data_type {
1083            DataType::List(field) | DataType::Map(field, _) => {
1084                let values_data = self.get_single_valid_child_data(field.data_type())?;
1085                self.validate_offsets::<i32>(values_data.len)?;
1086                Ok(())
1087            }
1088            DataType::LargeList(field) => {
1089                let values_data = self.get_single_valid_child_data(field.data_type())?;
1090                self.validate_offsets::<i64>(values_data.len)?;
1091                Ok(())
1092            }
1093            DataType::ListView(field) => {
1094                let values_data = self.get_single_valid_child_data(field.data_type())?;
1095                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1096                Ok(())
1097            }
1098            DataType::LargeListView(field) => {
1099                let values_data = self.get_single_valid_child_data(field.data_type())?;
1100                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1101                Ok(())
1102            }
1103            DataType::FixedSizeList(field, list_size) => {
1104                let values_data = self.get_single_valid_child_data(field.data_type())?;
1105
1106                let list_size: usize = (*list_size).try_into().map_err(|_| {
1107                    ArrowError::InvalidArgumentError(format!(
1108                        "{} has a negative list_size {}",
1109                        self.data_type, list_size
1110                    ))
1111                })?;
1112
1113                let expected_values_len = self.len
1114                    .checked_mul(list_size)
1115                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1116
1117                if values_data.len < expected_values_len {
1118                    return Err(ArrowError::InvalidArgumentError(format!(
1119                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1120                        values_data.len, self.len, list_size, self.data_type
1121                    )));
1122                }
1123
1124                Ok(())
1125            }
1126            DataType::Struct(fields) => {
1127                self.validate_num_child_data(fields.len())?;
1128                for (i, field) in fields.iter().enumerate() {
1129                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1130
1131                    // Ensure child field has sufficient size
1132                    if field_data.len < self.len {
1133                        return Err(ArrowError::InvalidArgumentError(format!(
1134                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1135                            self.data_type,
1136                            i,
1137                            field.name(),
1138                            field_data.len,
1139                            self.len
1140                        )));
1141                    }
1142                }
1143                Ok(())
1144            }
1145            DataType::RunEndEncoded(run_ends_field, values_field) => {
1146                self.validate_num_child_data(2)?;
1147                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1148                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1149                if run_ends_data.len != values_data.len {
1150                    return Err(ArrowError::InvalidArgumentError(format!(
1151                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1152                        run_ends_data.len, values_data.len
1153                    )));
1154                }
1155                if run_ends_data.nulls.is_some() {
1156                    return Err(ArrowError::InvalidArgumentError(
1157                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1158                    ));
1159                }
1160                Ok(())
1161            }
1162            DataType::Union(fields, mode) => {
1163                self.validate_num_child_data(fields.len())?;
1164
1165                for (i, (_, field)) in fields.iter().enumerate() {
1166                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1167
1168                    if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1169                        return Err(ArrowError::InvalidArgumentError(format!(
1170                            "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1171                            i,
1172                            field_data.len,
1173                            self.len + self.offset
1174                        )));
1175                    }
1176                }
1177                Ok(())
1178            }
1179            DataType::Dictionary(_key_type, value_type) => {
1180                self.get_single_valid_child_data(value_type)?;
1181                Ok(())
1182            }
1183            _ => {
1184                // other types do not have child data
1185                if !self.child_data.is_empty() {
1186                    return Err(ArrowError::InvalidArgumentError(format!(
1187                        "Expected no child arrays for type {} but got {}",
1188                        self.data_type,
1189                        self.child_data.len()
1190                    )));
1191                }
1192                Ok(())
1193            }
1194        }
1195    }
1196
1197    /// Ensures that this array data has a single child_data with the
1198    /// expected type, and calls `validate()` on it. Returns a
1199    /// reference to that child_data
1200    fn get_single_valid_child_data(
1201        &self,
1202        expected_type: &DataType,
1203    ) -> Result<&ArrayData, ArrowError> {
1204        self.validate_num_child_data(1)?;
1205        self.get_valid_child_data(0, expected_type)
1206    }
1207
1208    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1209    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1210        if self.child_data.len() != expected_len {
1211            Err(ArrowError::InvalidArgumentError(format!(
1212                "Value data for {} should contain {} child data array(s), had {}",
1213                self.data_type,
1214                expected_len,
1215                self.child_data.len()
1216            )))
1217        } else {
1218            Ok(())
1219        }
1220    }
1221
1222    /// Ensures that `child_data[i]` has the expected type, calls
1223    /// `validate()` on it, and returns a reference to that child_data
1224    fn get_valid_child_data(
1225        &self,
1226        i: usize,
1227        expected_type: &DataType,
1228    ) -> Result<&ArrayData, ArrowError> {
1229        let values_data = self.child_data.get(i).ok_or_else(|| {
1230            ArrowError::InvalidArgumentError(format!(
1231                "{} did not have enough child arrays. Expected at least {} but had only {}",
1232                self.data_type,
1233                i + 1,
1234                self.child_data.len()
1235            ))
1236        })?;
1237
1238        if expected_type != &values_data.data_type {
1239            return Err(ArrowError::InvalidArgumentError(format!(
1240                "Child type mismatch for {}. Expected {} but child data had {}",
1241                self.data_type, expected_type, values_data.data_type
1242            )));
1243        }
1244
1245        values_data.validate()?;
1246        Ok(values_data)
1247    }
1248
1249    /// Validate that the data contained within this [`ArrayData`] is valid
1250    ///
1251    /// 1. Null count is correct
1252    /// 2. All offsets are valid
1253    /// 3. All String data is valid UTF-8
1254    /// 4. All dictionary offsets are valid
1255    ///
1256    /// Internally this calls:
1257    ///
1258    /// * [`Self::validate`]
1259    /// * [`Self::validate_nulls`]
1260    /// * [`Self::validate_values`]
1261    ///
1262    /// Note: this does not recurse into children, for a recursive variant
1263    /// see [`Self::validate_full`]
1264    pub fn validate_data(&self) -> Result<(), ArrowError> {
1265        self.validate()?;
1266
1267        self.validate_nulls()?;
1268        self.validate_values()?;
1269        Ok(())
1270    }
1271
1272    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1273    ///
1274    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1275    /// and all its children recursively
1276    pub fn validate_full(&self) -> Result<(), ArrowError> {
1277        self.validate_data()?;
1278        // validate all children recursively
1279        self.child_data
1280            .iter()
1281            .enumerate()
1282            .try_for_each(|(i, child_data)| {
1283                child_data.validate_full().map_err(|e| {
1284                    ArrowError::InvalidArgumentError(format!(
1285                        "{} child #{} invalid: {}",
1286                        self.data_type, i, e
1287                    ))
1288                })
1289            })?;
1290        Ok(())
1291    }
1292
1293    /// Validates the values stored within this [`ArrayData`] are valid
1294    /// without recursing into child [`ArrayData`]
1295    ///
1296    /// Does not (yet) check
1297    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1298    /// 2. the the null count is correct and that any
1299    /// 3. nullability requirements of its children are correct
1300    ///
1301    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1302    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1303        if let Some(nulls) = &self.nulls {
1304            let actual = nulls.len() - nulls.inner().count_set_bits();
1305            if actual != nulls.null_count() {
1306                return Err(ArrowError::InvalidArgumentError(format!(
1307                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1308                    nulls.null_count(),
1309                    actual
1310                )));
1311            }
1312        }
1313
1314        // In general non-nullable children should not contain nulls, however, for certain
1315        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1316        // space in the child. As such we permit nulls in the children in the corresponding
1317        // positions for such types
1318        match &self.data_type {
1319            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1320                if !f.is_nullable() {
1321                    self.validate_non_nullable(None, &self.child_data[0])?
1322                }
1323            }
1324            DataType::FixedSizeList(field, len) => {
1325                let child = &self.child_data[0];
1326                if !field.is_nullable() {
1327                    match &self.nulls {
1328                        Some(nulls) => {
1329                            let element_len = *len as usize;
1330                            let expanded = nulls.expand(element_len);
1331                            self.validate_non_nullable(Some(&expanded), child)?;
1332                        }
1333                        None => self.validate_non_nullable(None, child)?,
1334                    }
1335                }
1336            }
1337            DataType::Struct(fields) => {
1338                for (field, child) in fields.iter().zip(&self.child_data) {
1339                    if !field.is_nullable() {
1340                        self.validate_non_nullable(self.nulls(), child)?
1341                    }
1342                }
1343            }
1344            _ => {}
1345        }
1346
1347        Ok(())
1348    }
1349
1350    /// Verifies that `child` contains no nulls not present in `mask`
1351    fn validate_non_nullable(
1352        &self,
1353        mask: Option<&NullBuffer>,
1354        child: &ArrayData,
1355    ) -> Result<(), ArrowError> {
1356        let mask = match mask {
1357            Some(mask) => mask,
1358            None => {
1359                return match child.null_count() {
1360                    0 => Ok(()),
1361                    _ => Err(ArrowError::InvalidArgumentError(format!(
1362                        "non-nullable child of type {} contains nulls not present in parent {}",
1363                        child.data_type, self.data_type
1364                    ))),
1365                };
1366            }
1367        };
1368
1369        match child.nulls() {
1370            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1371                "non-nullable child of type {} contains nulls not present in parent",
1372                child.data_type
1373            ))),
1374            _ => Ok(()),
1375        }
1376    }
1377
1378    /// Validates the values stored within this [`ArrayData`] are valid
1379    /// without recursing into child [`ArrayData`]
1380    ///
1381    /// Does not (yet) check
1382    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1383    pub fn validate_values(&self) -> Result<(), ArrowError> {
1384        match &self.data_type {
1385            DataType::Utf8 => self.validate_utf8::<i32>(),
1386            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1387            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1388            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1389            DataType::BinaryView => {
1390                let views = self.typed_buffer::<u128>(0, self.len)?;
1391                validate_binary_view(views, &self.buffers[1..])
1392            }
1393            DataType::Utf8View => {
1394                let views = self.typed_buffer::<u128>(0, self.len)?;
1395                validate_string_view(views, &self.buffers[1..])
1396            }
1397            DataType::List(_) | DataType::Map(_, _) => {
1398                let child = &self.child_data[0];
1399                self.validate_offsets_full::<i32>(child.len)
1400            }
1401            DataType::LargeList(_) => {
1402                let child = &self.child_data[0];
1403                self.validate_offsets_full::<i64>(child.len)
1404            }
1405            DataType::Union(_, _) => {
1406                // Validate Union Array as part of implementing new Union semantics
1407                // See comments in `ArrayData::validate()`
1408                // https://github.com/apache/arrow-rs/issues/85
1409                //
1410                // TODO file follow on ticket for full union validation
1411                Ok(())
1412            }
1413            DataType::Dictionary(key_type, _value_type) => {
1414                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1415                let max_value = dictionary_length - 1;
1416                match key_type.as_ref() {
1417                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1418                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1419                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1420                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1421                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1422                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1423                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1424                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1425                    _ => unreachable!(),
1426                }
1427            }
1428            DataType::RunEndEncoded(run_ends, _values) => {
1429                let run_ends_data = self.child_data()[0].clone();
1430                match run_ends.data_type() {
1431                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1432                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1433                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1434                    _ => unreachable!(),
1435                }
1436            }
1437            _ => {
1438                // No extra validation check required for other types
1439                Ok(())
1440            }
1441        }
1442    }
1443
    /// Calls the `validate(item_index, range)` function for each of
    /// the ranges specified in the arrow offsets buffer of type
    /// `T`. Also validates that each offset is not larger than
    /// `offset_limit`, is convertible to `usize`, and is not smaller
    /// than the offset before it.
    ///
    /// For an empty array, the offsets buffer can either be empty
    /// or contain a single `0`.
    ///
    /// For example, the offsets buffer contained `[1, 2, 4]`, this
    /// function would call `validate(0, 1..2)`, and `validate(1, 2..4)`
    ///
    /// Iteration stops at the first error, which is returned to the caller.
    ///
    /// NOTE(review): the result produced for position 0 is dropped by
    /// `.skip(1)` below, and that includes an error for an offset at
    /// position 0 that is unconvertible or above `offset_limit`. Callers
    /// presumably rely on `validate()` having checked the first offset;
    /// confirm before calling this on unvalidated data.
    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
    where
        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
    {
        self.typed_offsets::<T>()?
            .iter()
            .enumerate()
            // Stage 1: turn each raw offset into `(position, usize offset)` or an error
            .map(|(i, x)| {
                // check if the offset can be converted to usize
                let r = x.to_usize().ok_or_else(|| {
                    ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
                    );
                // check if the offset exceeds the limit
                match r {
                    Ok(n) if n <= offset_limit => Ok((i, n)),
                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
                    ),
                    Err(e) => Err(e),
                }
            })
            // Stage 2: pair each offset with the previous one (state starts at 0) to
            // form `previous..current`; the state only advances on a valid offset
            .scan(0_usize, |start, end| {
                // check offsets are monotonically increasing
                match end {
                    Ok((i, end)) if *start <= end => {
                        let range = Some(Ok((i, *start..end)));
                        *start = end;
                        range
                    }
                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
                        i - 1, start, end))
                    )),
                    Err(err) => Some(Err(err)),
                }
            })
            .skip(1) // the first element is meaningless
            // Stage 3: position `i` closes the range of item `i - 1`
            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
                let (item_index, range) = res?;
                validate(item_index-1, range)
            })
    }
1498
1499    /// Ensures that all strings formed by the offsets in `buffers[0]`
1500    /// into `buffers[1]` are valid utf8 sequences
1501    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1502    where
1503        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1504    {
1505        let values_buffer = &self.buffers[1].as_slice();
1506        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1507            // Validate Offsets are correct
1508            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1509                if !values_str.is_char_boundary(range.start)
1510                    || !values_str.is_char_boundary(range.end)
1511                {
1512                    return Err(ArrowError::InvalidArgumentError(format!(
1513                        "incomplete utf-8 byte sequence from index {string_index}"
1514                    )));
1515                }
1516                Ok(())
1517            })
1518        } else {
1519            // find specific offset that failed utf8 validation
1520            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1521                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1522                    ArrowError::InvalidArgumentError(format!(
1523                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1524                    ))
1525                })?;
1526                Ok(())
1527            })
1528        }
1529    }
1530
1531    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1532    /// between `0` and `offset_limit`
1533    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1534    where
1535        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1536    {
1537        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1538            // No validation applied to each value, but the iteration
1539            // itself applies bounds checking to each range
1540            Ok(())
1541        })
1542    }
1543
1544    /// Validates that each value in self.buffers (typed as T)
1545    /// is within the range [0, max_value], inclusive
1546    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1547    where
1548        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1549    {
1550        let required_len = self.len + self.offset;
1551        let buffer = &self.buffers[0];
1552
1553        // This should have been checked as part of `validate()` prior
1554        // to calling `validate_full()` but double check to be sure
1555        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1556
1557        // Justification: buffer size was validated above
1558        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1559
1560        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1561            // Do not check the value is null (value can be arbitrary)
1562            if self.is_null(i) {
1563                return Ok(());
1564            }
1565            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1566                ArrowError::InvalidArgumentError(format!(
1567                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1568                ))
1569            })?;
1570
1571            if dict_index < 0 || dict_index > max_value {
1572                return Err(ArrowError::InvalidArgumentError(format!(
1573                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1574                )));
1575            }
1576            Ok(())
1577        })
1578    }
1579
1580    /// Validates that each value in run_ends array is positive and strictly increasing.
1581    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1582    where
1583        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1584    {
1585        let values = self.typed_buffer::<T>(0, self.len)?;
1586        let mut prev_value: i64 = 0_i64;
1587        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1588            let value: i64 = inp_value.try_into().map_err(|_| {
1589                ArrowError::InvalidArgumentError(format!(
1590                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1591                ))
1592            })?;
1593            if value <= 0_i64 {
1594                return Err(ArrowError::InvalidArgumentError(format!(
1595                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1596                )));
1597            }
1598            if ix > 0 && value <= prev_value {
1599                return Err(ArrowError::InvalidArgumentError(format!(
1600                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1601                )));
1602            }
1603
1604            prev_value = value;
1605            Ok(())
1606        })?;
1607
1608        if prev_value.as_usize() < (self.offset + self.len) {
1609            return Err(ArrowError::InvalidArgumentError(format!(
1610                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1611                self.offset + self.len
1612            )));
1613        }
1614        Ok(())
1615    }
1616
1617    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1618    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1619    /// return false when the arrays are logically equal
1620    pub fn ptr_eq(&self, other: &Self) -> bool {
1621        if self.offset != other.offset
1622            || self.len != other.len
1623            || self.data_type != other.data_type
1624            || self.buffers.len() != other.buffers.len()
1625            || self.child_data.len() != other.child_data.len()
1626        {
1627            return false;
1628        }
1629
1630        match (&self.nulls, &other.nulls) {
1631            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1632            (Some(_), None) | (None, Some(_)) => return false,
1633            _ => {}
1634        };
1635
1636        if !self
1637            .buffers
1638            .iter()
1639            .zip(other.buffers.iter())
1640            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1641        {
1642            return false;
1643        }
1644
1645        self.child_data
1646            .iter()
1647            .zip(other.child_data.iter())
1648            .all(|(a, b)| a.ptr_eq(b))
1649    }
1650
1651    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1652    pub fn into_builder(self) -> ArrayDataBuilder {
1653        self.into()
1654    }
1655}
1656
1657/// Return the expected [`DataTypeLayout`] Arrays of this data
1658/// type are expected to have
1659pub fn layout(data_type: &DataType) -> DataTypeLayout {
1660    // based on C/C++ implementation in
1661    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1662    use arrow_schema::IntervalUnit::*;
1663
1664    match data_type {
1665        DataType::Null => DataTypeLayout {
1666            buffers: vec![],
1667            can_contain_null_mask: false,
1668            variadic: false,
1669        },
1670        DataType::Boolean => DataTypeLayout {
1671            buffers: vec![BufferSpec::BitMap],
1672            can_contain_null_mask: true,
1673            variadic: false,
1674        },
1675        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1676        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1677        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1678        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1679        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1680        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1681        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1682        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1683        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1684        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1685        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1686        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1687        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1688        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1689        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1690        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1691        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1692        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1693        DataType::Interval(MonthDayNano) => {
1694            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1695        }
1696        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1697        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1698        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1699        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1700        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1701        DataType::FixedSizeBinary(size) => {
1702            let spec = BufferSpec::FixedWidth {
1703                byte_width: (*size).try_into().unwrap(),
1704                alignment: mem::align_of::<u8>(),
1705            };
1706            DataTypeLayout {
1707                buffers: vec![spec],
1708                can_contain_null_mask: true,
1709                variadic: false,
1710            }
1711        }
1712        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1713        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1714        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1715        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1716        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1717        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1718        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1719        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1720        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1721        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1722        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1723        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1724        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1725        DataType::Union(_, mode) => {
1726            let type_ids = BufferSpec::FixedWidth {
1727                byte_width: mem::size_of::<i8>(),
1728                alignment: mem::align_of::<i8>(),
1729            };
1730
1731            DataTypeLayout {
1732                buffers: match mode {
1733                    UnionMode::Sparse => {
1734                        vec![type_ids]
1735                    }
1736                    UnionMode::Dense => {
1737                        vec![
1738                            type_ids,
1739                            BufferSpec::FixedWidth {
1740                                byte_width: mem::size_of::<i32>(),
1741                                alignment: mem::align_of::<i32>(),
1742                            },
1743                        ]
1744                    }
1745                },
1746                can_contain_null_mask: false,
1747                variadic: false,
1748            }
1749        }
1750        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1751    }
1752}
1753
/// Describes the buffers an array of a given data type is expected to have
#[derive(Debug, PartialEq, Eq)]
// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
pub struct DataTypeLayout {
    /// One layout specification per expected buffer
    pub buffers: Vec<BufferSpec>,

    /// Whether arrays of this type can contain a null bitmask
    pub can_contain_null_mask: bool,

    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
    /// buffers.len(). Buffers that exceed the lower bound are legal.
    pub variadic: bool,
}

impl DataTypeLayout {
    /// Builds the [`BufferSpec::FixedWidth`] describing a buffer whose
    /// elements have type `T`
    fn fixed_width_spec<T>() -> BufferSpec {
        BufferSpec::FixedWidth {
            byte_width: mem::size_of::<T>(),
            alignment: mem::align_of::<T>(),
        }
    }

    /// Describes a basic numeric array where each element has type `T`
    pub fn new_fixed_width<T>() -> Self {
        Self {
            buffers: vec![Self::fixed_width_spec::<T>()],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// but may still have a Null Bitmap (e.g. FixedSizeList)
    pub fn new_nullable_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// (e.g. RunEndEncoded).
    pub fn new_empty() -> Self {
        Self {
            can_contain_null_mask: false,
            ..Self::new_nullable_empty()
        }
    }

    /// Describes a variable width array: a fixed width offset buffer
    /// with elements of type `T`, followed by a variable width data buffer
    pub fn new_binary<T>() -> Self {
        let offsets = Self::fixed_width_spec::<T>();
        let values = BufferSpec::VariableWidth;
        Self {
            buffers: vec![offsets, values],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes a view type: one fixed width buffer of `u128` elements,
    /// with any number of additional buffers allowed
    pub fn new_view() -> Self {
        Self {
            variadic: true,
            ..Self::new_fixed_width::<u128>()
        }
    }

    /// Describes a list view type: two fixed width buffers with elements
    /// of type `T`
    pub fn new_list_view<T>() -> Self {
        Self {
            buffers: vec![
                Self::fixed_width_spec::<T>(),
                Self::fixed_width_spec::<T>(),
            ],
            can_contain_null_mask: true,
            variadic: false,
        }
    }
}

/// Layout specification for a single data type buffer
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
    ///
    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
    ///
    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
    ///
    /// Note that these alignment requirements will vary between architectures
    FixedWidth {
        /// Width of a single element, in bytes
        byte_width: usize,
        /// Alignment Rust requires for an array of the corresponding primitive
        alignment: usize,
    },
    /// Variable width, such as string data for utf8 data
    VariableWidth,
    /// Buffer holds a bitmap.
    ///
    /// Note: Unlike the C++ implementation, the null/validity buffer
    /// is handled specially rather than as another of the buffers in
    /// the spec, so this variant is only used for the Boolean type.
    BitMap,
    /// Buffer is always null. Unused currently in Rust implementation,
    /// (used in C++ for Union type)
    #[allow(dead_code)]
    AlwaysNull,
}
1885
1886impl PartialEq for ArrayData {
1887    fn eq(&self, other: &Self) -> bool {
1888        equal::equal(self, other)
1889    }
1890}
1891
/// A boolean flag whose value can only be changed from `unsafe` code.
///
/// The flag starts out as `false`.
///
/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
///
/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
///
/// # Example
/// ```rust
/// use arrow_data::UnsafeFlag;
/// assert!(!UnsafeFlag::default().get()); // default is false
/// let mut flag = UnsafeFlag::new();
/// assert!(!flag.get()); // defaults to false
/// // can only set it to true in unsafe code
/// unsafe { flag.set(true) };
/// assert!(flag.get()); // now true
/// ```
#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a new `UnsafeFlag` holding `false`.
    ///
    /// See the examples on [`UnsafeFlag`]
    #[inline]
    pub const fn new() -> Self {
        UnsafeFlag(false)
    }

    /// Replaces the value of the flag with `val`
    ///
    /// This is deliberately only callable from `unsafe` code
    ///
    /// # Safety
    ///
    /// Changing the flag is not immediately unsafe in itself, however the
    /// flag can be used to subsequently bypass safety checks in the
    /// [`ArrayDataBuilder`].
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        let UnsafeFlag(slot) = self;
        *slot = val;
    }

    /// Returns the current value of the flag
    #[inline]
    pub fn get(&self) -> bool {
        let UnsafeFlag(current) = *self;
        current
    }
}

// Written by hand (rather than derived) to make it obvious that the default
// flag can never be constructed as true
impl Default for UnsafeFlag {
    fn default() -> Self {
        UnsafeFlag::new()
    }
}
1950
/// Builder for [`ArrayData`] type
///
/// The fields below are collected through the builder methods and turned
/// into an [`ArrayData`] by `build`, which validates the result unless
/// `skip_validation` has been set.
#[derive(Debug)]
pub struct ArrayDataBuilder {
    /// Data type of the array being built
    data_type: DataType,
    /// Length of the array; starts at 0
    len: usize,
    /// Null count supplied by the caller. `build` only uses it when the null
    /// buffer is created from `null_bit_buffer`
    null_count: Option<usize>,
    /// Raw validity bitmap. `build` wraps it using `offset` and `len`, and
    /// only when `nulls` is not set
    null_bit_buffer: Option<Buffer>,
    /// Null buffer; when set, `build` uses it instead of `null_bit_buffer`
    nulls: Option<NullBuffer>,
    /// Offset of the array; starts at 0
    offset: usize,
    /// Buffers of the array
    buffers: Vec<Buffer>,
    /// Child data of the array
    child_data: Vec<ArrayData>,
    /// Should buffers be realigned (copying if necessary)?
    ///
    /// Defaults to false.
    align_buffers: bool,
    /// Should data validation be skipped for this [`ArrayData`]?
    ///
    /// Defaults to false.
    ///
    /// # Safety
    ///
    /// This flag can only be set to true using `unsafe` APIs. However, once true
    /// subsequent calls to `build()` may result in undefined behavior if the data
    /// is not valid.
    skip_validation: UnsafeFlag,
}
1977
1978impl ArrayDataBuilder {
1979    #[inline]
1980    /// Creates a new array data builder
1981    pub const fn new(data_type: DataType) -> Self {
1982        Self {
1983            data_type,
1984            len: 0,
1985            null_count: None,
1986            null_bit_buffer: None,
1987            nulls: None,
1988            offset: 0,
1989            buffers: vec![],
1990            child_data: vec![],
1991            align_buffers: false,
1992            skip_validation: UnsafeFlag::new(),
1993        }
1994    }
1995
1996    /// Creates a new array data builder from an existing one, changing the data type
1997    pub fn data_type(self, data_type: DataType) -> Self {
1998        Self { data_type, ..self }
1999    }
2000
2001    #[inline]
2002    #[allow(clippy::len_without_is_empty)]
2003    /// Sets the length of the [ArrayData]
2004    pub const fn len(mut self, n: usize) -> Self {
2005        self.len = n;
2006        self
2007    }
2008
2009    /// Sets the null buffer of the [ArrayData]
2010    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
2011        self.nulls = nulls;
2012        self.null_count = None;
2013        self.null_bit_buffer = None;
2014        self
2015    }
2016
2017    /// Sets the null count of the [ArrayData]
2018    pub fn null_count(mut self, null_count: usize) -> Self {
2019        self.null_count = Some(null_count);
2020        self
2021    }
2022
2023    /// Sets the `null_bit_buffer` of the [ArrayData]
2024    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
2025        self.nulls = None;
2026        self.null_bit_buffer = buf;
2027        self
2028    }
2029
2030    /// Sets the offset of the [ArrayData]
2031    #[inline]
2032    pub const fn offset(mut self, n: usize) -> Self {
2033        self.offset = n;
2034        self
2035    }
2036
2037    /// Sets the buffers of the [ArrayData]
2038    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
2039        self.buffers = v;
2040        self
2041    }
2042
2043    /// Adds a single buffer to the [ArrayData]'s buffers
2044    pub fn add_buffer(mut self, b: Buffer) -> Self {
2045        self.buffers.push(b);
2046        self
2047    }
2048
2049    /// Adds multiple buffers to the [ArrayData]'s buffers
2050    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
2051        self.buffers.extend(bs);
2052        self
2053    }
2054
2055    /// Sets the child data of the [ArrayData]
2056    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
2057        self.child_data = v;
2058        self
2059    }
2060
2061    /// Adds a single child data to the [ArrayData]'s child data
2062    pub fn add_child_data(mut self, r: ArrayData) -> Self {
2063        self.child_data.push(r);
2064        self
2065    }
2066
2067    /// Creates an array data, without any validation
2068    ///
2069    /// Note: This is shorthand for
2070    /// ```rust
2071    /// # #[expect(unsafe_op_in_unsafe_fn)]
2072    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
2073    /// # let _ = unsafe {
2074    /// builder.skip_validation(true).build().unwrap()
2075    /// # };
2076    /// ```
2077    ///
2078    /// # Safety
2079    ///
2080    /// The same caveats as [`ArrayData::new_unchecked`]
2081    /// apply.
2082    pub unsafe fn build_unchecked(self) -> ArrayData {
2083        unsafe { self.skip_validation(true) }.build().unwrap()
2084    }
2085
2086    /// Creates an `ArrayData`, consuming `self`
2087    ///
2088    /// # Safety
2089    ///
2090    /// By default the underlying buffers are checked to ensure they are valid
2091    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
2092    /// to true (by the `unsafe` API) this validation is skipped. If the data is
2093    /// not valid, undefined behavior will result.
2094    pub fn build(self) -> Result<ArrayData, ArrowError> {
2095        let Self {
2096            data_type,
2097            len,
2098            null_count,
2099            null_bit_buffer,
2100            nulls,
2101            offset,
2102            buffers,
2103            child_data,
2104            align_buffers,
2105            skip_validation,
2106        } = self;
2107
2108        let nulls = nulls
2109            .or_else(|| {
2110                let buffer = null_bit_buffer?;
2111                let buffer = BooleanBuffer::new(buffer, offset, len);
2112                Some(match null_count {
2113                    Some(n) => {
2114                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2115                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2116                    }
2117                    None => NullBuffer::new(buffer),
2118                })
2119            })
2120            .filter(|b| b.null_count() != 0);
2121
2122        let mut data = ArrayData {
2123            data_type,
2124            len,
2125            offset,
2126            buffers,
2127            child_data,
2128            nulls,
2129        };
2130
2131        if align_buffers {
2132            data.align_buffers();
2133        }
2134
2135        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2136        if !skip_validation.get() || cfg!(feature = "force_validate") {
2137            data.validate_data()?;
2138        }
2139        Ok(data)
2140    }
2141
2142    /// Creates an array data, validating all inputs, and aligning any buffers
2143    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2144    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2145        self.align_buffers(true).build()
2146    }
2147
2148    /// Ensure that all buffers are aligned, copying data if necessary
2149    ///
2150    /// Rust requires that arrays are aligned to their corresponding primitive,
2151    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2152    ///
2153    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2154    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2155    ///
2156    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2157    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2158    /// when necessary, making it useful when interacting with buffers produced by other systems,
2159    /// e.g. IPC or FFI.
2160    ///
2161    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2162    /// insufficiently aligned buffers.
2163    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2164        self.align_buffers = align_buffers;
2165        self
2166    }
2167
2168    /// Skips validation of the data.
2169    ///
2170    /// If this flag is enabled, `[Self::build`] will skip validation of the
2171    /// data
2172    ///
2173    /// If this flag is not enabled, `[Self::build`] will validate that all
2174    /// buffers are valid and will return an error if any data is invalid.
2175    /// Validation can be expensive.
2176    ///
2177    /// # Safety
2178    ///
2179    /// If validation is skipped, the buffers must form a valid Arrow array,
2180    /// otherwise undefined behavior will result
2181    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2182        unsafe {
2183            self.skip_validation.set(skip_validation);
2184        }
2185        self
2186    }
2187}
2188
2189impl From<ArrayData> for ArrayDataBuilder {
2190    fn from(d: ArrayData) -> Self {
2191        Self {
2192            data_type: d.data_type,
2193            len: d.len,
2194            offset: d.offset,
2195            buffers: d.buffers,
2196            child_data: d.child_data,
2197            nulls: d.nulls,
2198            null_bit_buffer: None,
2199            null_count: None,
2200            align_buffers: false,
2201            skip_validation: UnsafeFlag::new(),
2202        }
2203    }
2204}
2205
2206#[cfg(test)]
2207mod tests {
2208    use super::*;
2209    use arrow_schema::{Field, Fields};
2210
2211    // See arrow/tests/array_data_validation.rs for test of array validation
2212
2213    /// returns a buffer initialized with some constant value for tests
2214    fn make_i32_buffer(n: usize) -> Buffer {
2215        Buffer::from_slice_ref(vec![42i32; n])
2216    }
2217
2218    /// returns a buffer initialized with some constant value for tests
2219    fn make_f32_buffer(n: usize) -> Buffer {
2220        Buffer::from_slice_ref(vec![42f32; n])
2221    }
2222
2223    #[test]
2224    fn test_builder() {
2225        // Buffer needs to be at least 25 long
2226        let v = (0..25).collect::<Vec<i32>>();
2227        let b1 = Buffer::from_slice_ref(&v);
2228        let arr_data = ArrayData::builder(DataType::Int32)
2229            .len(20)
2230            .offset(5)
2231            .add_buffer(b1)
2232            .null_bit_buffer(Some(Buffer::from([
2233                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2234            ])))
2235            .build()
2236            .unwrap();
2237
2238        assert_eq!(20, arr_data.len());
2239        assert_eq!(10, arr_data.null_count());
2240        assert_eq!(5, arr_data.offset());
2241        assert_eq!(1, arr_data.buffers().len());
2242        assert_eq!(
2243            Buffer::from_slice_ref(&v).as_slice(),
2244            arr_data.buffers()[0].as_slice()
2245        );
2246    }
2247
2248    #[test]
2249    fn test_builder_with_child_data() {
2250        let child_arr_data = ArrayData::try_new(
2251            DataType::Int32,
2252            5,
2253            None,
2254            0,
2255            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2256            vec![],
2257        )
2258        .unwrap();
2259
2260        let field = Arc::new(Field::new("x", DataType::Int32, true));
2261        let data_type = DataType::Struct(vec![field].into());
2262
2263        let arr_data = ArrayData::builder(data_type)
2264            .len(5)
2265            .offset(0)
2266            .add_child_data(child_arr_data.clone())
2267            .build()
2268            .unwrap();
2269
2270        assert_eq!(5, arr_data.len());
2271        assert_eq!(1, arr_data.child_data().len());
2272        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2273    }
2274
2275    #[test]
2276    fn test_null_count() {
2277        let mut bit_v: [u8; 2] = [0; 2];
2278        bit_util::set_bit(&mut bit_v, 0);
2279        bit_util::set_bit(&mut bit_v, 3);
2280        bit_util::set_bit(&mut bit_v, 10);
2281        let arr_data = ArrayData::builder(DataType::Int32)
2282            .len(16)
2283            .add_buffer(make_i32_buffer(16))
2284            .null_bit_buffer(Some(Buffer::from(bit_v)))
2285            .build()
2286            .unwrap();
2287        assert_eq!(13, arr_data.null_count());
2288
2289        // Test with offset
2290        let mut bit_v: [u8; 2] = [0; 2];
2291        bit_util::set_bit(&mut bit_v, 0);
2292        bit_util::set_bit(&mut bit_v, 3);
2293        bit_util::set_bit(&mut bit_v, 10);
2294        let arr_data = ArrayData::builder(DataType::Int32)
2295            .len(12)
2296            .offset(2)
2297            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2298            .null_bit_buffer(Some(Buffer::from(bit_v)))
2299            .build()
2300            .unwrap();
2301        assert_eq!(10, arr_data.null_count());
2302    }
2303
2304    #[test]
2305    fn test_null_buffer_ref() {
2306        let mut bit_v: [u8; 2] = [0; 2];
2307        bit_util::set_bit(&mut bit_v, 0);
2308        bit_util::set_bit(&mut bit_v, 3);
2309        bit_util::set_bit(&mut bit_v, 10);
2310        let arr_data = ArrayData::builder(DataType::Int32)
2311            .len(16)
2312            .add_buffer(make_i32_buffer(16))
2313            .null_bit_buffer(Some(Buffer::from(bit_v)))
2314            .build()
2315            .unwrap();
2316        assert!(arr_data.nulls().is_some());
2317        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2318    }
2319
2320    #[test]
2321    fn test_slice() {
2322        let mut bit_v: [u8; 2] = [0; 2];
2323        bit_util::set_bit(&mut bit_v, 0);
2324        bit_util::set_bit(&mut bit_v, 3);
2325        bit_util::set_bit(&mut bit_v, 10);
2326        let data = ArrayData::builder(DataType::Int32)
2327            .len(16)
2328            .add_buffer(make_i32_buffer(16))
2329            .null_bit_buffer(Some(Buffer::from(bit_v)))
2330            .build()
2331            .unwrap();
2332        let new_data = data.slice(1, 15);
2333        assert_eq!(data.len() - 1, new_data.len());
2334        assert_eq!(1, new_data.offset());
2335        assert_eq!(data.null_count(), new_data.null_count());
2336
2337        // slice of a slice (removes one null)
2338        let new_data = new_data.slice(1, 14);
2339        assert_eq!(data.len() - 2, new_data.len());
2340        assert_eq!(2, new_data.offset());
2341        assert_eq!(data.null_count() - 1, new_data.null_count());
2342    }
2343
2344    #[test]
2345    fn test_equality() {
2346        let int_data = ArrayData::builder(DataType::Int32)
2347            .len(1)
2348            .add_buffer(make_i32_buffer(1))
2349            .build()
2350            .unwrap();
2351
2352        let float_data = ArrayData::builder(DataType::Float32)
2353            .len(1)
2354            .add_buffer(make_f32_buffer(1))
2355            .build()
2356            .unwrap();
2357        assert_ne!(int_data, float_data);
2358        assert!(!int_data.ptr_eq(&float_data));
2359        assert!(int_data.ptr_eq(&int_data));
2360
2361        #[allow(clippy::redundant_clone)]
2362        let int_data_clone = int_data.clone();
2363        assert_eq!(int_data, int_data_clone);
2364        assert!(int_data.ptr_eq(&int_data_clone));
2365        assert!(int_data_clone.ptr_eq(&int_data));
2366
2367        let int_data_slice = int_data_clone.slice(1, 0);
2368        assert!(int_data_slice.ptr_eq(&int_data_slice));
2369        assert!(!int_data.ptr_eq(&int_data_slice));
2370        assert!(!int_data_slice.ptr_eq(&int_data));
2371
2372        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2373        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2374        let string_data = ArrayData::try_new(
2375            DataType::Utf8,
2376            3,
2377            Some(Buffer::from_iter(vec![true, false, true])),
2378            0,
2379            vec![offsets_buffer, data_buffer],
2380            vec![],
2381        )
2382        .unwrap();
2383
2384        assert_ne!(float_data, string_data);
2385        assert!(!float_data.ptr_eq(&string_data));
2386
2387        assert!(string_data.ptr_eq(&string_data));
2388
2389        #[allow(clippy::redundant_clone)]
2390        let string_data_cloned = string_data.clone();
2391        assert!(string_data_cloned.ptr_eq(&string_data));
2392        assert!(string_data.ptr_eq(&string_data_cloned));
2393
2394        let string_data_slice = string_data.slice(1, 2);
2395        assert!(string_data_slice.ptr_eq(&string_data_slice));
2396        assert!(!string_data_slice.ptr_eq(&string_data))
2397    }
2398
2399    #[test]
2400    fn test_slice_memory_size() {
2401        let mut bit_v: [u8; 2] = [0; 2];
2402        bit_util::set_bit(&mut bit_v, 0);
2403        bit_util::set_bit(&mut bit_v, 3);
2404        bit_util::set_bit(&mut bit_v, 10);
2405        let data = ArrayData::builder(DataType::Int32)
2406            .len(16)
2407            .add_buffer(make_i32_buffer(16))
2408            .null_bit_buffer(Some(Buffer::from(bit_v)))
2409            .build()
2410            .unwrap();
2411        let new_data = data.slice(1, 14);
2412        assert_eq!(
2413            data.get_slice_memory_size().unwrap() - 8,
2414            new_data.get_slice_memory_size().unwrap()
2415        );
2416        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2417        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2418        let string_data = ArrayData::try_new(
2419            DataType::Utf8,
2420            3,
2421            Some(Buffer::from_iter(vec![true, false, true])),
2422            0,
2423            vec![offsets_buffer, data_buffer],
2424            vec![],
2425        )
2426        .unwrap();
2427        let string_data_slice = string_data.slice(1, 2);
2428        //4 bytes of offset and 2 bytes of data reduced by slicing.
2429        assert_eq!(
2430            string_data.get_slice_memory_size().unwrap() - 6,
2431            string_data_slice.get_slice_memory_size().unwrap()
2432        );
2433    }
2434
2435    #[test]
2436    fn test_count_nulls() {
2437        let buffer = Buffer::from([0b00010110, 0b10011111]);
2438        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2439        let count = count_nulls(Some(&buffer), 0, 16);
2440        assert_eq!(count, 7);
2441
2442        let count = count_nulls(Some(&buffer), 4, 8);
2443        assert_eq!(count, 3);
2444    }
2445
2446    #[test]
2447    fn test_contains_nulls() {
2448        let buffer: Buffer =
2449            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2450        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2451        assert!(contains_nulls(Some(&buffer), 0, 6));
2452        assert!(contains_nulls(Some(&buffer), 0, 3));
2453        assert!(!contains_nulls(Some(&buffer), 3, 2));
2454        assert!(!contains_nulls(Some(&buffer), 0, 0));
2455    }
2456
2457    #[test]
2458    fn test_alignment() {
2459        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2460        let sliced = buffer.slice(1);
2461
2462        let mut data = ArrayData {
2463            data_type: DataType::Int32,
2464            len: 0,
2465            offset: 0,
2466            buffers: vec![buffer],
2467            child_data: vec![],
2468            nulls: None,
2469        };
2470        data.validate_full().unwrap();
2471
2472        // break alignment in data
2473        data.buffers[0] = sliced;
2474        let err = data.validate().unwrap_err();
2475
2476        assert_eq!(
2477            err.to_string(),
2478            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2479        );
2480
2481        data.align_buffers();
2482        data.validate_full().unwrap();
2483    }
2484
2485    #[test]
2486    fn test_alignment_struct() {
2487        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2488        let sliced = buffer.slice(1);
2489
2490        let child_data = ArrayData {
2491            data_type: DataType::Int32,
2492            len: 0,
2493            offset: 0,
2494            buffers: vec![buffer],
2495            child_data: vec![],
2496            nulls: None,
2497        };
2498
2499        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2500        let mut data = ArrayData {
2501            data_type: schema,
2502            len: 0,
2503            offset: 0,
2504            buffers: vec![],
2505            child_data: vec![child_data],
2506            nulls: None,
2507        };
2508        data.validate_full().unwrap();
2509
2510        // break alignment in child data
2511        data.child_data[0].buffers[0] = sliced;
2512        let err = data.validate().unwrap_err();
2513
2514        assert_eq!(
2515            err.to_string(),
2516            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2517        );
2518
2519        data.align_buffers();
2520        data.validate_full().unwrap();
2521    }
2522
2523    #[test]
2524    fn test_null_view_types() {
2525        let array_len = 32;
2526        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2527        assert_eq!(array.len(), array_len);
2528        for i in 0..array.len() {
2529            assert!(array.is_null(i));
2530        }
2531
2532        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2533        assert_eq!(array.len(), array_len);
2534        for i in 0..array.len() {
2535            assert!(array.is_null(i));
2536        }
2537
2538        let array = ArrayData::new_null(
2539            &DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2540            array_len,
2541        );
2542        assert_eq!(array.len(), array_len);
2543        for i in 0..array.len() {
2544            assert!(array.is_null(i));
2545        }
2546
2547        let array = ArrayData::new_null(
2548            &DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2549            array_len,
2550        );
2551        assert_eq!(array.len(), array_len);
2552        for i in 0..array.len() {
2553            assert!(array.is_null(i));
2554        }
2555    }
2556}