arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
19//! common attributes and operations for Arrow array.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35    null_bit_buffer: Option<&NullBuffer>,
36    offset: usize,
37    len: usize,
38) -> bool {
39    match null_bit_buffer {
40        Some(buffer) => {
41            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42                Some((start, end)) => start != 0 || end != len,
43                None => len != 0, // No non-null values
44            }
45        }
46        None => false, // No null buffer
47    }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52    null_bit_buffer: Option<&NullBuffer>,
53    offset: usize,
54    len: usize,
55) -> usize {
56    if let Some(buf) = null_bit_buffer {
57        let buffer = buf.buffer();
58        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59    } else {
60        0
61    }
62}
63
64/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67    let empty_buffer = MutableBuffer::new(0);
68    match data_type {
69        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70        DataType::Boolean => {
71            let bytes = bit_util::ceil(capacity, 8);
72            let buffer = MutableBuffer::new(bytes);
73            [buffer, empty_buffer]
74        }
75        DataType::UInt8
76        | DataType::UInt16
77        | DataType::UInt32
78        | DataType::UInt64
79        | DataType::Int8
80        | DataType::Int16
81        | DataType::Int32
82        | DataType::Int64
83        | DataType::Float16
84        | DataType::Float32
85        | DataType::Float64
86        | DataType::Decimal32(_, _)
87        | DataType::Decimal64(_, _)
88        | DataType::Decimal128(_, _)
89        | DataType::Decimal256(_, _)
90        | DataType::Date32
91        | DataType::Time32(_)
92        | DataType::Date64
93        | DataType::Time64(_)
94        | DataType::Duration(_)
95        | DataType::Timestamp(_, _)
96        | DataType::Interval(_) => [
97            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98            empty_buffer,
99        ],
100        DataType::Utf8 | DataType::Binary => {
101            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102            // safety: `unsafe` code assumes that this buffer is initialized with one element
103            buffer.push(0i32);
104            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105        }
106        DataType::LargeUtf8 | DataType::LargeBinary => {
107            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108            // safety: `unsafe` code assumes that this buffer is initialized with one element
109            buffer.push(0i64);
110            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111        }
112        DataType::BinaryView | DataType::Utf8View => [
113            MutableBuffer::new(capacity * mem::size_of::<u128>()),
114            empty_buffer,
115        ],
116        DataType::List(_) | DataType::Map(_, _) => {
117            // offset buffer always starts with a zero
118            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119            buffer.push(0i32);
120            [buffer, empty_buffer]
121        }
122        DataType::ListView(_) => [
123            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124            MutableBuffer::new(capacity * mem::size_of::<i32>()),
125        ],
126        DataType::LargeList(_) => {
127            // offset buffer always starts with a zero
128            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129            buffer.push(0i64);
130            [buffer, empty_buffer]
131        }
132        DataType::LargeListView(_) => [
133            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134            MutableBuffer::new(capacity * mem::size_of::<i64>()),
135        ],
136        DataType::FixedSizeBinary(size) => {
137            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138        }
139        DataType::Dictionary(k, _) => [
140            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141            empty_buffer,
142        ],
143        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144            [empty_buffer, MutableBuffer::new(0)]
145        }
146        DataType::Union(_, mode) => {
147            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148            match mode {
149                UnionMode::Sparse => [type_ids, empty_buffer],
150                UnionMode::Dense => {
151                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152                    [type_ids, offsets]
153                }
154            }
155        }
156    }
157}
158
/// A generic representation of Arrow array data which encapsulates common attributes
/// and operations for Arrow array.
///
/// Specific operations for different arrays types (e.g., primitive, list, struct)
/// are implemented in `Array`.
///
/// # Memory Layout
///
/// `ArrayData` has references to one or more underlying data buffers
/// and optional child ArrayData, depending on type as illustrated
/// below. Bitmaps are not shown for simplicity but they are stored
/// similarly to the buffers.
///
/// ```text
///                        offset
///                       points to
/// ┌───────────────────┐ start of  ┌───────┐       Different
/// │                   │   data    │       │     ArrayData may
/// │ArrayData {        │           │....   │     also refer to
/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
/// │  buffers: [       │           │5882   │◀─
/// │    ...            │  │        │4323   │
/// │  ]                │   ─ ─ ─ ─▶│4859   │
/// │  child_data: [    │           │....   │
/// │    ...            │           │       │
/// │  ]                │           └───────┘
/// │}                  │
/// │                   │            Shared Buffer uses
/// │               │   │            bytes::Bytes to hold
/// └───────────────────┘            actual data values
///           ┌ ─ ─ ┘
///
///           ▼
/// ┌───────────────────┐
/// │ArrayData {        │
/// │  ...              │
/// │}                  │
/// │                   │
/// └───────────────────┘
///
/// Child ArrayData may also have its own buffers and children
/// ```

#[derive(Debug, Clone)]
pub struct ArrayData {
    /// The data type
    data_type: DataType,

    /// The number of elements
    len: usize,

    /// The offset in number of items (not bytes).
    ///
    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
    /// does NOT apply to [`Self::nulls`].
    offset: usize,

    /// The buffers that store the actual data for this array, as defined
    /// in the [Arrow Spec].
    ///
    /// Depending on the array types, [`Self::buffers`] can hold different
    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
    /// positions.
    ///
    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
    /// we need to satisfy `len`.
    ///
    /// [Arrow Spec]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout
    buffers: Vec<Buffer>,

    /// The child(ren) of this array.
    ///
    /// Only non-empty for nested types, such as `ListArray` and
    /// `StructArray`.
    ///
    /// The first logical element in each child element begins at `offset`.
    ///
    /// If the child element also has an offset then these offsets are
    /// cumulative.
    child_data: Vec<ArrayData>,

    /// The null bitmap.
    ///
    /// `None` indicates all values are non-null in this array.
    ///
    /// [`Self::offset`] does not apply to the null bitmap. While the
    /// BooleanBuffer may be sliced (have its own offset) internally, this
    /// `NullBuffer` always represents exactly `len` elements.
    nulls: Option<NullBuffer>,
}
252
/// A thread-safe, shared reference to the Arrow array data.
///
/// This is an [`Arc`] around [`ArrayData`], so cloning it only bumps a
/// reference count.
pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257    /// Create a new ArrayData instance;
258    ///
259    /// If `null_count` is not specified, the number of nulls in
260    /// null_bit_buffer is calculated.
261    ///
262    /// If the number of nulls is 0 then the null_bit_buffer
263    /// is set to `None`.
264    ///
265    /// # Safety
266    ///
267    /// The input values *must* form a valid Arrow array for
268    /// `data_type`, or undefined behavior can result.
269    ///
270    /// Note: This is a low level API and most users of the arrow
271    /// crate should create arrays using the methods in the `array`
272    /// module.
273    pub unsafe fn new_unchecked(
274        data_type: DataType,
275        len: usize,
276        null_count: Option<usize>,
277        null_bit_buffer: Option<Buffer>,
278        offset: usize,
279        buffers: Vec<Buffer>,
280        child_data: Vec<ArrayData>,
281    ) -> Self {
282        let mut skip_validation = UnsafeFlag::new();
283        // SAFETY: caller responsible for ensuring data is valid
284        unsafe { skip_validation.set(true) };
285
286        ArrayDataBuilder {
287            data_type,
288            len,
289            null_count,
290            null_bit_buffer,
291            nulls: None,
292            offset,
293            buffers,
294            child_data,
295            align_buffers: false,
296            skip_validation,
297        }
298        .build()
299        .unwrap()
300    }
301
302    /// Create a new ArrayData, validating that the provided buffers form a valid
303    /// Arrow array of the specified data type.
304    ///
305    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
306    /// is set to `None`.
307    ///
308    /// Internally this calls through to [`Self::validate_data`]
309    ///
310    /// Note: This is a low level API and most users of the arrow crate should create
311    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
312    pub fn try_new(
313        data_type: DataType,
314        len: usize,
315        null_bit_buffer: Option<Buffer>,
316        offset: usize,
317        buffers: Vec<Buffer>,
318        child_data: Vec<ArrayData>,
319    ) -> Result<Self, ArrowError> {
320        // we must check the length of `null_bit_buffer` first
321        // because we use this buffer to calculate `null_count`
322        // in `Self::new_unchecked`.
323        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
324            let needed_len = bit_util::ceil(len + offset, 8);
325            if null_bit_buffer.len() < needed_len {
326                return Err(ArrowError::InvalidArgumentError(format!(
327                    "null_bit_buffer size too small. got {} needed {}",
328                    null_bit_buffer.len(),
329                    needed_len
330                )));
331            }
332        }
333        // Safety justification: `validate_full` is called below
334        let new_self = unsafe {
335            Self::new_unchecked(
336                data_type,
337                len,
338                None,
339                null_bit_buffer,
340                offset,
341                buffers,
342                child_data,
343            )
344        };
345
346        // As the data is not trusted, do a full validation of its contents
347        // We don't need to validate children as we can assume that the
348        // [`ArrayData`] in `child_data` have already been validated through
349        // a call to `ArrayData::try_new` or created using unsafe
350        new_self.validate_data()?;
351        Ok(new_self)
352    }
353
    /// Returns an [`ArrayDataBuilder`] for constructing an [`ArrayData`]
    /// instance of the given `data_type`.
    ///
    /// This is shorthand for [`ArrayDataBuilder::new`].
    #[inline]
    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
        ArrayDataBuilder::new(data_type)
    }
359
    /// Returns a reference to the [`DataType`] of this [`ArrayData`].
    #[inline]
    pub const fn data_type(&self) -> &DataType {
        &self.data_type
    }
365
    /// Returns the [`Buffer`]s storing data for this [`ArrayData`].
    ///
    /// The slice is returned as stored: [`Self::offset`] is not applied to it.
    pub fn buffers(&self) -> &[Buffer] {
        &self.buffers
    }
370
371    /// Returns a slice of children [`ArrayData`]. This will be non
372    /// empty for type such as lists and structs.
373    pub fn child_data(&self) -> &[ArrayData] {
374        &self.child_data[..]
375    }
376
377    /// Returns whether the element at index `i` is null
378    #[inline]
379    pub fn is_null(&self, i: usize) -> bool {
380        match &self.nulls {
381            Some(v) => v.is_null(i),
382            None => false,
383        }
384    }
385
    /// Returns a reference to the null buffer of this [`ArrayData`], or `None`
    /// if it has no null buffer.
    ///
    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
    #[inline]
    pub fn nulls(&self) -> Option<&NullBuffer> {
        self.nulls.as_ref()
    }
393
    /// Returns whether the element at index `i` is not null.
    ///
    /// This is the negation of [`Self::is_null`].
    #[inline]
    pub fn is_valid(&self, i: usize) -> bool {
        !self.is_null(i)
    }
399
    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
    #[inline]
    pub const fn len(&self) -> usize {
        self.len
    }
405
    /// Returns whether this [`ArrayData`] is empty, i.e. its length is zero.
    #[inline]
    pub const fn is_empty(&self) -> bool {
        self.len == 0
    }
411
    /// Returns the offset of this [`ArrayData`], counted in items (not bytes).
    #[inline]
    pub const fn offset(&self) -> usize {
        self.offset
    }
417
418    /// Returns the total number of nulls in this array
419    #[inline]
420    pub fn null_count(&self) -> usize {
421        self.nulls
422            .as_ref()
423            .map(|x| x.null_count())
424            .unwrap_or_default()
425    }
426
427    /// Returns the total number of bytes of memory occupied by the
428    /// buffers owned by this [`ArrayData`] and all of its
429    /// children. (See also diagram on [`ArrayData`]).
430    ///
431    /// Note that this [`ArrayData`] may only refer to a subset of the
432    /// data in the underlying [`Buffer`]s (due to `offset` and
433    /// `length`), but the size returned includes the entire size of
434    /// the buffers.
435    ///
436    /// If multiple [`ArrayData`]s refer to the same underlying
437    /// [`Buffer`]s they will both report the same size.
438    pub fn get_buffer_memory_size(&self) -> usize {
439        let mut size = 0;
440        for buffer in &self.buffers {
441            size += buffer.capacity();
442        }
443        if let Some(bitmap) = &self.nulls {
444            size += bitmap.buffer().capacity()
445        }
446        for child in &self.child_data {
447            size += child.get_buffer_memory_size();
448        }
449        size
450    }
451
452    /// Returns the total number of the bytes of memory occupied by
453    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
454    ///
455    /// This is approximately the number of bytes if a new
456    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
457    /// exactly the data needed.
458    ///
459    /// For example, a [`DataType::Int64`] with `100` elements,
460    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
461    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
462    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
463    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
464    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
465        let mut result: usize = 0;
466        let layout = layout(&self.data_type);
467
468        for spec in layout.buffers.iter() {
469            match spec {
470                BufferSpec::FixedWidth { byte_width, .. } => {
471                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
472                        ArrowError::ComputeError(
473                            "Integer overflow computing buffer size".to_string(),
474                        )
475                    })?;
476                    result += buffer_size;
477                }
478                BufferSpec::VariableWidth => {
479                    let buffer_len = match self.data_type {
480                        DataType::Utf8 | DataType::Binary => {
481                            let offsets = self.typed_offsets::<i32>()?;
482                            (offsets[self.len] - offsets[0]) as usize
483                        }
484                        DataType::LargeUtf8 | DataType::LargeBinary => {
485                            let offsets = self.typed_offsets::<i64>()?;
486                            (offsets[self.len] - offsets[0]) as usize
487                        }
488                        _ => {
489                            return Err(ArrowError::NotYetImplemented(format!(
490                                "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
491                                self.data_type
492                            )));
493                        }
494                    };
495                    result += buffer_len;
496                }
497                BufferSpec::BitMap => {
498                    let buffer_size = bit_util::ceil(self.len, 8);
499                    result += buffer_size;
500                }
501                BufferSpec::AlwaysNull => {
502                    // Nothing to do
503                }
504            }
505        }
506
507        if self.nulls().is_some() {
508            result += bit_util::ceil(self.len, 8);
509        }
510
511        for child in &self.child_data {
512            result += child.get_slice_memory_size()?;
513        }
514        Ok(result)
515    }
516
517    /// Returns the total number of bytes of memory occupied
518    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
519    /// children. (See also diagram on [`ArrayData`]).
520    ///
521    /// Equivalent to:
522    ///  `size_of_val(self)` +
523    ///  [`Self::get_buffer_memory_size`] +
524    ///  `size_of_val(child)` for all children
525    pub fn get_array_memory_size(&self) -> usize {
526        let mut size = mem::size_of_val(self);
527
528        // Calculate rest of the fields top down which contain actual data
529        for buffer in &self.buffers {
530            size += mem::size_of::<Buffer>();
531            size += buffer.capacity();
532        }
533        if let Some(nulls) = &self.nulls {
534            size += nulls.buffer().capacity();
535        }
536        for child in &self.child_data {
537            size += child.get_array_memory_size();
538        }
539
540        size
541    }
542
543    /// Creates a zero-copy slice of itself. This creates a new
544    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
545    /// different offset and len
546    ///
547    /// # Panics
548    ///
549    /// Panics if `offset + length > self.len()`.
550    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
551        assert!((offset + length) <= self.len());
552
553        if let DataType::Struct(_) = self.data_type() {
554            // Slice into children
555            let new_offset = self.offset + offset;
556            ArrayData {
557                data_type: self.data_type().clone(),
558                len: length,
559                offset: new_offset,
560                buffers: self.buffers.clone(),
561                // Slice child data, to propagate offsets down to them
562                child_data: self
563                    .child_data()
564                    .iter()
565                    .map(|data| data.slice(offset, length))
566                    .collect(),
567                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
568            }
569        } else {
570            let mut new_data = self.clone();
571
572            new_data.len = length;
573            new_data.offset = offset + self.offset;
574            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
575
576            new_data
577        }
578    }
579
580    /// Returns the `buffer` as a slice of type `T` starting at self.offset
581    ///
582    /// # Panics
583    /// This function panics if:
584    /// * the buffer is not byte-aligned with type T, or
585    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
586    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
587        &self.buffers()[buffer].typed_data()[self.offset..]
588    }
589
    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
    ///
    /// Value and offset buffers are zero-filled. For most types nullness is
    /// expressed with an all-null [`NullBuffer`] of length `len`. `Null`,
    /// `Union` and `RunEndEncoded` get no top-level null buffer; for the
    /// latter two the nulls live in the child data instead.
    ///
    /// # Panics
    ///
    /// Panics if:
    /// * `data_type` is a `Union` without any field,
    /// * `data_type` is a dense `Union` and `len` does not fit in an `i32`,
    /// * `data_type` is `RunEndEncoded` and `len` does not fit in the run-end
    ///   type, or the run-end type is not `Int16`, `Int32` or `Int64`.
    pub fn new_null(data_type: &DataType, len: usize) -> Self {
        // Bytes needed for a bitmap with one bit per element
        let bit_len = bit_util::ceil(len, 8);
        // Allocates a zero-filled buffer of `len` bytes
        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));

        // `has_nulls` records whether a top-level null buffer must be attached
        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
            // Fixed-width primitives: a single zeroed values buffer
            Some(width) => (vec![zeroed(width * len)], vec![], true),
            None => match data_type {
                DataType::Null => (vec![], vec![], false),
                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
                // `len + 1` zero offsets and an empty values buffer
                DataType::Binary | DataType::Utf8 => {
                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
                }
                // 16 bytes per view
                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
                DataType::LargeBinary | DataType::LargeUtf8 => {
                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
                }
                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
                // List-like types: zero offsets pointing into an empty child
                DataType::List(f) | DataType::Map(f, _) => (
                    vec![zeroed((len + 1) * 4)],
                    vec![ArrayData::new_empty(f.data_type())],
                    true,
                ),
                DataType::LargeList(f) => (
                    vec![zeroed((len + 1) * 8)],
                    vec![ArrayData::new_empty(f.data_type())],
                    true,
                ),
                // List views: two zeroed buffers of `len` entries and an empty child
                DataType::ListView(f) => (
                    vec![zeroed(len * 4), zeroed(len * 4)],
                    vec![ArrayData::new_empty(f.data_type())],
                    true,
                ),
                DataType::LargeListView(f) => (
                    vec![zeroed(len * 8), zeroed(len * 8)],
                    vec![ArrayData::new_empty(f.data_type())],
                    true,
                ),
                // The child holds `list_len` null values for each of the `len` lists
                DataType::FixedSizeList(f, list_len) => (
                    vec![],
                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
                    true,
                ),
                // Every field gets its own all-null child of the same length
                DataType::Struct(fields) => (
                    vec![],
                    fields
                        .iter()
                        .map(|f| Self::new_null(f.data_type(), len))
                        .collect(),
                    true,
                ),
                // Zeroed keys with an empty values child
                DataType::Dictionary(k, v) => (
                    vec![zeroed(k.primitive_width().unwrap() * len)],
                    vec![ArrayData::new_empty(v.as_ref())],
                    true,
                ),
                DataType::Union(f, mode) => {
                    // Every slot uses the type id of the first field
                    let (id, _) = f.iter().next().unwrap();
                    let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
                    let buffers = match mode {
                        UnionMode::Sparse => vec![ids],
                        UnionMode::Dense => {
                            // Dense offsets are simply 0..len
                            let end_offset = i32::from_usize(len).unwrap();
                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
                        }
                    };

                    // Sparse: every child is an all-null array of `len`.
                    // Dense: only the first child holds the nulls, the rest are empty.
                    let children = f
                        .iter()
                        .enumerate()
                        .map(|(idx, (_, f))| {
                            if idx == 0 || *mode == UnionMode::Sparse {
                                Self::new_null(f.data_type(), len)
                            } else {
                                Self::new_empty(f.data_type())
                            }
                        })
                        .collect();

                    (buffers, children, false)
                }
                DataType::RunEndEncoded(r, v) => {
                    // A single run whose end is `len`
                    let runs = match r.data_type() {
                        DataType::Int16 => {
                            let i = i16::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        DataType::Int32 => {
                            let i = i32::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        DataType::Int64 => {
                            let i = i64::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        dt => unreachable!("Invalid run ends data type {dt}"),
                    };

                    let builder = ArrayData::builder(r.data_type().clone())
                        .len(1)
                        .buffers(vec![runs]);

                    // SAFETY:
                    // Valid by construction
                    let runs = unsafe { builder.build_unchecked() };
                    // The values child is a single null
                    (
                        vec![],
                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
                        false,
                    )
                }
                d => unreachable!("{d}"),
            },
        };

        let mut builder = ArrayDataBuilder::new(data_type.clone())
            .len(len)
            .buffers(buffers)
            .child_data(child_data);

        if has_nulls {
            builder = builder.nulls(Some(NullBuffer::new_null(len)))
        }

        // SAFETY:
        // Data valid by construction
        unsafe { builder.build_unchecked() }
    }
718
    /// Returns a new empty [`ArrayData`] valid for `data_type`.
    ///
    /// Equivalent to [`Self::new_null`] with a length of `0`.
    pub fn new_empty(data_type: &DataType) -> Self {
        Self::new_null(data_type, 0)
    }
723
724    /// Verifies that the buffers meet the minimum alignment requirements for the data type
725    ///
726    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
727    ///
728    /// This can be useful for when interacting with data sent over IPC or FFI, that may
729    /// not meet the minimum alignment requirements
730    ///
731    /// This also aligns buffers of children data
732    pub fn align_buffers(&mut self) {
733        let layout = layout(&self.data_type);
734        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
735            if let BufferSpec::FixedWidth { alignment, .. } = spec {
736                if buffer.as_ptr().align_offset(*alignment) != 0 {
737                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
738                }
739            }
740        }
741        // align children data recursively
742        for data in self.child_data.iter_mut() {
743            data.align_buffers()
744        }
745    }
746
747    /// "cheap" validation of an `ArrayData`. Ensures buffers are
748    /// sufficiently sized to store `len` + `offset` total elements of
749    /// `data_type` and performs other inexpensive consistency checks.
750    ///
751    /// This check is "cheap" in the sense that it does not validate the
752    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
753    /// are within the bounds of the values buffer).
754    ///
755    /// See [ArrayData::validate_data] to validate fully the offset content
756    /// and the validity of utf8 data
757    pub fn validate(&self) -> Result<(), ArrowError> {
        // Need at least this much space in each buffer
759        let len_plus_offset = self.len + self.offset;
760
761        // Check that the data layout conforms to the spec
762        let layout = layout(&self.data_type);
763
764        if !layout.can_contain_null_mask && self.nulls.is_some() {
765            return Err(ArrowError::InvalidArgumentError(format!(
766                "Arrays of type {:?} cannot contain a null bitmask",
767                self.data_type,
768            )));
769        }
770
771        // Check data buffers length for view types and other types
772        if self.buffers.len() < layout.buffers.len()
773            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
774        {
775            return Err(ArrowError::InvalidArgumentError(format!(
776                "Expected {} buffers in array of type {:?}, got {}",
777                layout.buffers.len(),
778                self.data_type,
779                self.buffers.len(),
780            )));
781        }
782
783        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
784            match spec {
785                BufferSpec::FixedWidth {
786                    byte_width,
787                    alignment,
788                } => {
789                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
790
791                    if buffer.len() < min_buffer_size {
792                        return Err(ArrowError::InvalidArgumentError(format!(
793                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
794                            min_buffer_size,
795                            i,
796                            self.data_type,
797                            buffer.len()
798                        )));
799                    }
800
801                    let align_offset = buffer.as_ptr().align_offset(*alignment);
802                    if align_offset != 0 {
803                        return Err(ArrowError::InvalidArgumentError(format!(
804                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
805                            self.data_type,
806                            align_offset.min(alignment - align_offset)
807                        )));
808                    }
809                }
810                BufferSpec::VariableWidth => {
811                    // not cheap to validate (need to look at the
812                    // data). Partially checked in validate_offsets
813                    // called below. Can check with `validate_full`
814                }
815                BufferSpec::BitMap => {
816                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
817                    if buffer.len() < min_buffer_size {
818                        return Err(ArrowError::InvalidArgumentError(format!(
819                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
820                            min_buffer_size,
821                            i,
822                            self.data_type,
823                            buffer.len()
824                        )));
825                    }
826                }
827                BufferSpec::AlwaysNull => {
828                    // Nothing to validate
829                }
830            }
831        }
832
833        // check null bit buffer size
834        if let Some(nulls) = self.nulls() {
835            if nulls.null_count() > self.len {
836                return Err(ArrowError::InvalidArgumentError(format!(
837                    "null_count {} for an array exceeds length of {} elements",
838                    nulls.null_count(),
839                    self.len
840                )));
841            }
842
843            let actual_len = nulls.validity().len();
844            let needed_len = bit_util::ceil(len_plus_offset, 8);
845            if actual_len < needed_len {
846                return Err(ArrowError::InvalidArgumentError(format!(
847                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
848                )));
849            }
850
851            if nulls.len() != self.len {
852                return Err(ArrowError::InvalidArgumentError(format!(
853                    "null buffer incorrect size. got {} expected {}",
854                    nulls.len(),
855                    self.len
856                )));
857            }
858        }
859
860        self.validate_child_data()?;
861
862        // Additional Type specific checks
863        match &self.data_type {
864            DataType::Utf8 | DataType::Binary => {
865                self.validate_offsets::<i32>(self.buffers[1].len())?;
866            }
867            DataType::LargeUtf8 | DataType::LargeBinary => {
868                self.validate_offsets::<i64>(self.buffers[1].len())?;
869            }
870            DataType::Dictionary(key_type, _value_type) => {
871                // At the moment, constructing a DictionaryArray will also check this
872                if !DataType::is_dictionary_key_type(key_type) {
873                    return Err(ArrowError::InvalidArgumentError(format!(
874                        "Dictionary key type must be integer, but was {key_type}"
875                    )));
876                }
877            }
878            DataType::RunEndEncoded(run_ends_type, _) => {
879                if run_ends_type.is_nullable() {
880                    return Err(ArrowError::InvalidArgumentError(
881                        "The nullable should be set to false for the field defining run_ends array.".to_string()
882                    ));
883                }
884                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
885                    return Err(ArrowError::InvalidArgumentError(format!(
886                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
887                        run_ends_type.data_type()
888                    )));
889                }
890            }
891            _ => {}
892        };
893
894        Ok(())
895    }
896
897    /// Returns a reference to the data in `buffer` as a typed slice
898    /// (typically `&[i32]` or `&[i64]`) after validating. The
899    /// returned slice is guaranteed to have at least `self.len + 1`
900    /// entries.
901    ///
902    /// For an empty array, the `buffer` can also be empty.
903    fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
904        // An empty list-like array can have 0 offsets
905        if self.len == 0 && self.buffers[0].is_empty() {
906            return Ok(&[]);
907        }
908
909        self.typed_buffer(0, self.len + 1)
910    }
911
912    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
913    fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
914        &self,
915        idx: usize,
916        len: usize,
917    ) -> Result<&[T], ArrowError> {
918        let buffer = &self.buffers[idx];
919
920        let required_len = (len + self.offset) * mem::size_of::<T>();
921
922        if buffer.len() < required_len {
923            return Err(ArrowError::InvalidArgumentError(format!(
924                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
925                idx,
926                self.data_type,
927                required_len,
928                buffer.len()
929            )));
930        }
931
932        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
933    }
934
935    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
936    /// offsets (of type T) into some other buffer of `values_length` bytes long
937    fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
938        &self,
939        values_length: usize,
940    ) -> Result<(), ArrowError> {
941        // Justification: buffer size was validated above
942        let offsets = self.typed_offsets::<T>()?;
943        if offsets.is_empty() {
944            return Ok(());
945        }
946
947        let first_offset = offsets[0].to_usize().ok_or_else(|| {
948            ArrowError::InvalidArgumentError(format!(
949                "Error converting offset[0] ({}) to usize for {}",
950                offsets[0], self.data_type
951            ))
952        })?;
953
954        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
955            ArrowError::InvalidArgumentError(format!(
956                "Error converting offset[{}] ({}) to usize for {}",
957                self.len, offsets[self.len], self.data_type
958            ))
959        })?;
960
961        if first_offset > values_length {
962            return Err(ArrowError::InvalidArgumentError(format!(
963                "First offset {} of {} is larger than values length {}",
964                first_offset, self.data_type, values_length,
965            )));
966        }
967
968        if last_offset > values_length {
969            return Err(ArrowError::InvalidArgumentError(format!(
970                "Last offset {} of {} is larger than values length {}",
971                last_offset, self.data_type, values_length,
972            )));
973        }
974
975        if first_offset > last_offset {
976            return Err(ArrowError::InvalidArgumentError(format!(
977                "First offset {} in {} is smaller than last offset {}",
978                first_offset, self.data_type, last_offset,
979            )));
980        }
981
982        Ok(())
983    }
984
985    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
986    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
987    fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
988        &self,
989        values_length: usize,
990    ) -> Result<(), ArrowError> {
991        let offsets: &[T] = self.typed_buffer(0, self.len)?;
992        let sizes: &[T] = self.typed_buffer(1, self.len)?;
993        if offsets.len() != sizes.len() {
994            return Err(ArrowError::ComputeError(format!(
995                "ListView offsets len {} does not match sizes len {}",
996                offsets.len(),
997                sizes.len()
998            )));
999        }
1000
1001        for i in 0..sizes.len() {
1002            let size = sizes[i].to_usize().ok_or_else(|| {
1003                ArrowError::InvalidArgumentError(format!(
1004                    "Error converting size[{}] ({}) to usize for {}",
1005                    i, sizes[i], self.data_type
1006                ))
1007            })?;
1008            let offset = offsets[i].to_usize().ok_or_else(|| {
1009                ArrowError::InvalidArgumentError(format!(
1010                    "Error converting offset[{}] ({}) to usize for {}",
1011                    i, offsets[i], self.data_type
1012                ))
1013            })?;
1014            if size
1015                .checked_add(offset)
1016                .expect("Offset and size have exceeded the usize boundary")
1017                > values_length
1018            {
1019                return Err(ArrowError::InvalidArgumentError(format!(
1020                    "Size {} at index {} is larger than the remaining values for {}",
1021                    size, i, self.data_type
1022                )));
1023            }
1024        }
1025        Ok(())
1026    }
1027
1028    /// Validates the layout of `child_data` ArrayData structures
1029    fn validate_child_data(&self) -> Result<(), ArrowError> {
1030        match &self.data_type {
1031            DataType::List(field) | DataType::Map(field, _) => {
1032                let values_data = self.get_single_valid_child_data(field.data_type())?;
1033                self.validate_offsets::<i32>(values_data.len)?;
1034                Ok(())
1035            }
1036            DataType::LargeList(field) => {
1037                let values_data = self.get_single_valid_child_data(field.data_type())?;
1038                self.validate_offsets::<i64>(values_data.len)?;
1039                Ok(())
1040            }
1041            DataType::ListView(field) => {
1042                let values_data = self.get_single_valid_child_data(field.data_type())?;
1043                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1044                Ok(())
1045            }
1046            DataType::LargeListView(field) => {
1047                let values_data = self.get_single_valid_child_data(field.data_type())?;
1048                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1049                Ok(())
1050            }
1051            DataType::FixedSizeList(field, list_size) => {
1052                let values_data = self.get_single_valid_child_data(field.data_type())?;
1053
1054                let list_size: usize = (*list_size).try_into().map_err(|_| {
1055                    ArrowError::InvalidArgumentError(format!(
1056                        "{} has a negative list_size {}",
1057                        self.data_type, list_size
1058                    ))
1059                })?;
1060
1061                let expected_values_len = self.len
1062                    .checked_mul(list_size)
1063                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1064
1065                if values_data.len < expected_values_len {
1066                    return Err(ArrowError::InvalidArgumentError(format!(
1067                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1068                        values_data.len, self.len, list_size, self.data_type
1069                    )));
1070                }
1071
1072                Ok(())
1073            }
1074            DataType::Struct(fields) => {
1075                self.validate_num_child_data(fields.len())?;
1076                for (i, field) in fields.iter().enumerate() {
1077                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1078
1079                    // Ensure child field has sufficient size
1080                    if field_data.len < self.len {
1081                        return Err(ArrowError::InvalidArgumentError(format!(
1082                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1083                            self.data_type,
1084                            i,
1085                            field.name(),
1086                            field_data.len,
1087                            self.len
1088                        )));
1089                    }
1090                }
1091                Ok(())
1092            }
1093            DataType::RunEndEncoded(run_ends_field, values_field) => {
1094                self.validate_num_child_data(2)?;
1095                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1096                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1097                if run_ends_data.len != values_data.len {
1098                    return Err(ArrowError::InvalidArgumentError(format!(
1099                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1100                        run_ends_data.len, values_data.len
1101                    )));
1102                }
1103                if run_ends_data.nulls.is_some() {
1104                    return Err(ArrowError::InvalidArgumentError(
1105                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1106                    ));
1107                }
1108                Ok(())
1109            }
1110            DataType::Union(fields, mode) => {
1111                self.validate_num_child_data(fields.len())?;
1112
1113                for (i, (_, field)) in fields.iter().enumerate() {
1114                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1115
1116                    if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1117                        return Err(ArrowError::InvalidArgumentError(format!(
1118                            "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1119                            i,
1120                            field_data.len,
1121                            self.len + self.offset
1122                        )));
1123                    }
1124                }
1125                Ok(())
1126            }
1127            DataType::Dictionary(_key_type, value_type) => {
1128                self.get_single_valid_child_data(value_type)?;
1129                Ok(())
1130            }
1131            _ => {
1132                // other types do not have child data
1133                if !self.child_data.is_empty() {
1134                    return Err(ArrowError::InvalidArgumentError(format!(
1135                        "Expected no child arrays for type {} but got {}",
1136                        self.data_type,
1137                        self.child_data.len()
1138                    )));
1139                }
1140                Ok(())
1141            }
1142        }
1143    }
1144
1145    /// Ensures that this array data has a single child_data with the
1146    /// expected type, and calls `validate()` on it. Returns a
1147    /// reference to that child_data
1148    fn get_single_valid_child_data(
1149        &self,
1150        expected_type: &DataType,
1151    ) -> Result<&ArrayData, ArrowError> {
1152        self.validate_num_child_data(1)?;
1153        self.get_valid_child_data(0, expected_type)
1154    }
1155
1156    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1157    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1158        if self.child_data.len() != expected_len {
1159            Err(ArrowError::InvalidArgumentError(format!(
1160                "Value data for {} should contain {} child data array(s), had {}",
1161                self.data_type,
1162                expected_len,
1163                self.child_data.len()
1164            )))
1165        } else {
1166            Ok(())
1167        }
1168    }
1169
1170    /// Ensures that `child_data[i]` has the expected type, calls
1171    /// `validate()` on it, and returns a reference to that child_data
1172    fn get_valid_child_data(
1173        &self,
1174        i: usize,
1175        expected_type: &DataType,
1176    ) -> Result<&ArrayData, ArrowError> {
1177        let values_data = self.child_data.get(i).ok_or_else(|| {
1178            ArrowError::InvalidArgumentError(format!(
1179                "{} did not have enough child arrays. Expected at least {} but had only {}",
1180                self.data_type,
1181                i + 1,
1182                self.child_data.len()
1183            ))
1184        })?;
1185
1186        if expected_type != &values_data.data_type {
1187            return Err(ArrowError::InvalidArgumentError(format!(
1188                "Child type mismatch for {}. Expected {} but child data had {}",
1189                self.data_type, expected_type, values_data.data_type
1190            )));
1191        }
1192
1193        values_data.validate()?;
1194        Ok(values_data)
1195    }
1196
1197    /// Validate that the data contained within this [`ArrayData`] is valid
1198    ///
1199    /// 1. Null count is correct
1200    /// 2. All offsets are valid
1201    /// 3. All String data is valid UTF-8
1202    /// 4. All dictionary offsets are valid
1203    ///
1204    /// Internally this calls:
1205    ///
1206    /// * [`Self::validate`]
1207    /// * [`Self::validate_nulls`]
1208    /// * [`Self::validate_values`]
1209    ///
1210    /// Note: this does not recurse into children, for a recursive variant
1211    /// see [`Self::validate_full`]
1212    pub fn validate_data(&self) -> Result<(), ArrowError> {
1213        self.validate()?;
1214
1215        self.validate_nulls()?;
1216        self.validate_values()?;
1217        Ok(())
1218    }
1219
1220    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1221    ///
1222    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1223    /// and all its children recursively
1224    pub fn validate_full(&self) -> Result<(), ArrowError> {
1225        self.validate_data()?;
1226        // validate all children recursively
1227        self.child_data
1228            .iter()
1229            .enumerate()
1230            .try_for_each(|(i, child_data)| {
1231                child_data.validate_full().map_err(|e| {
1232                    ArrowError::InvalidArgumentError(format!(
1233                        "{} child #{} invalid: {}",
1234                        self.data_type, i, e
1235                    ))
1236                })
1237            })?;
1238        Ok(())
1239    }
1240
1241    /// Validates the values stored within this [`ArrayData`] are valid
1242    /// without recursing into child [`ArrayData`]
1243    ///
1244    /// Does not (yet) check
1245    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1246    /// 2. the the null count is correct and that any
1247    /// 3. nullability requirements of its children are correct
1248    ///
1249    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1250    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1251        if let Some(nulls) = &self.nulls {
1252            let actual = nulls.len() - nulls.inner().count_set_bits();
1253            if actual != nulls.null_count() {
1254                return Err(ArrowError::InvalidArgumentError(format!(
1255                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1256                    nulls.null_count(),
1257                    actual
1258                )));
1259            }
1260        }
1261
1262        // In general non-nullable children should not contain nulls, however, for certain
1263        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1264        // space in the child. As such we permit nulls in the children in the corresponding
1265        // positions for such types
1266        match &self.data_type {
1267            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1268                if !f.is_nullable() {
1269                    self.validate_non_nullable(None, &self.child_data[0])?
1270                }
1271            }
1272            DataType::FixedSizeList(field, len) => {
1273                let child = &self.child_data[0];
1274                if !field.is_nullable() {
1275                    match &self.nulls {
1276                        Some(nulls) => {
1277                            let element_len = *len as usize;
1278                            let expanded = nulls.expand(element_len);
1279                            self.validate_non_nullable(Some(&expanded), child)?;
1280                        }
1281                        None => self.validate_non_nullable(None, child)?,
1282                    }
1283                }
1284            }
1285            DataType::Struct(fields) => {
1286                for (field, child) in fields.iter().zip(&self.child_data) {
1287                    if !field.is_nullable() {
1288                        self.validate_non_nullable(self.nulls(), child)?
1289                    }
1290                }
1291            }
1292            _ => {}
1293        }
1294
1295        Ok(())
1296    }
1297
1298    /// Verifies that `child` contains no nulls not present in `mask`
1299    fn validate_non_nullable(
1300        &self,
1301        mask: Option<&NullBuffer>,
1302        child: &ArrayData,
1303    ) -> Result<(), ArrowError> {
1304        let mask = match mask {
1305            Some(mask) => mask,
1306            None => {
1307                return match child.null_count() {
1308                    0 => Ok(()),
1309                    _ => Err(ArrowError::InvalidArgumentError(format!(
1310                        "non-nullable child of type {} contains nulls not present in parent {}",
1311                        child.data_type, self.data_type
1312                    ))),
1313                };
1314            }
1315        };
1316
1317        match child.nulls() {
1318            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1319                "non-nullable child of type {} contains nulls not present in parent",
1320                child.data_type
1321            ))),
1322            _ => Ok(()),
1323        }
1324    }
1325
1326    /// Validates the values stored within this [`ArrayData`] are valid
1327    /// without recursing into child [`ArrayData`]
1328    ///
1329    /// Does not (yet) check
1330    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1331    pub fn validate_values(&self) -> Result<(), ArrowError> {
1332        match &self.data_type {
1333            DataType::Utf8 => self.validate_utf8::<i32>(),
1334            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1335            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1336            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1337            DataType::BinaryView => {
1338                let views = self.typed_buffer::<u128>(0, self.len)?;
1339                validate_binary_view(views, &self.buffers[1..])
1340            }
1341            DataType::Utf8View => {
1342                let views = self.typed_buffer::<u128>(0, self.len)?;
1343                validate_string_view(views, &self.buffers[1..])
1344            }
1345            DataType::List(_) | DataType::Map(_, _) => {
1346                let child = &self.child_data[0];
1347                self.validate_offsets_full::<i32>(child.len)
1348            }
1349            DataType::LargeList(_) => {
1350                let child = &self.child_data[0];
1351                self.validate_offsets_full::<i64>(child.len)
1352            }
1353            DataType::Union(_, _) => {
1354                // Validate Union Array as part of implementing new Union semantics
1355                // See comments in `ArrayData::validate()`
1356                // https://github.com/apache/arrow-rs/issues/85
1357                //
1358                // TODO file follow on ticket for full union validation
1359                Ok(())
1360            }
1361            DataType::Dictionary(key_type, _value_type) => {
1362                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1363                let max_value = dictionary_length - 1;
1364                match key_type.as_ref() {
1365                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1366                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1367                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1368                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1369                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1370                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1371                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1372                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1373                    _ => unreachable!(),
1374                }
1375            }
1376            DataType::RunEndEncoded(run_ends, _values) => {
1377                let run_ends_data = self.child_data()[0].clone();
1378                match run_ends.data_type() {
1379                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1380                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1381                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1382                    _ => unreachable!(),
1383                }
1384            }
1385            _ => {
1386                // No extra validation check required for other types
1387                Ok(())
1388            }
1389        }
1390    }
1391
1392    /// Calls the `validate(item_index, range)` function for each of
1393    /// the ranges specified in the arrow offsets buffer of type
1394    /// `T`. Also validates that each offset is smaller than
1395    /// `offset_limit`
1396    ///
1397    /// For an empty array, the offsets buffer can either be empty
1398    /// or contain a single `0`.
1399    ///
1400    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1401    /// function would call `validate([1,2])`, and `validate([2,4])`
1402    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1403    where
1404        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1405        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1406    {
1407        self.typed_offsets::<T>()?
1408            .iter()
1409            .enumerate()
1410            .map(|(i, x)| {
1411                // check if the offset can be converted to usize
1412                let r = x.to_usize().ok_or_else(|| {
1413                    ArrowError::InvalidArgumentError(format!(
1414                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1415                    );
1416                // check if the offset exceeds the limit
1417                match r {
1418                    Ok(n) if n <= offset_limit => Ok((i, n)),
1419                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1420                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1421                    ),
1422                    Err(e) => Err(e),
1423                }
1424            })
1425            .scan(0_usize, |start, end| {
1426                // check offsets are monotonically increasing
1427                match end {
1428                    Ok((i, end)) if *start <= end => {
1429                        let range = Some(Ok((i, *start..end)));
1430                        *start = end;
1431                        range
1432                    }
1433                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1434                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1435                        i - 1, start, end))
1436                    )),
1437                    Err(err) => Some(Err(err)),
1438                }
1439            })
1440            .skip(1) // the first element is meaningless
1441            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1442                let (item_index, range) = res?;
1443                validate(item_index-1, range)
1444            })
1445    }
1446
1447    /// Ensures that all strings formed by the offsets in `buffers[0]`
1448    /// into `buffers[1]` are valid utf8 sequences
1449    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1450    where
1451        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1452    {
1453        let values_buffer = &self.buffers[1].as_slice();
1454        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1455            // Validate Offsets are correct
1456            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1457                if !values_str.is_char_boundary(range.start)
1458                    || !values_str.is_char_boundary(range.end)
1459                {
1460                    return Err(ArrowError::InvalidArgumentError(format!(
1461                        "incomplete utf-8 byte sequence from index {string_index}"
1462                    )));
1463                }
1464                Ok(())
1465            })
1466        } else {
1467            // find specific offset that failed utf8 validation
1468            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1469                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1470                    ArrowError::InvalidArgumentError(format!(
1471                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1472                    ))
1473                })?;
1474                Ok(())
1475            })
1476        }
1477    }
1478
1479    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1480    /// between `0` and `offset_limit`
1481    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1482    where
1483        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1484    {
1485        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1486            // No validation applied to each value, but the iteration
1487            // itself applies bounds checking to each range
1488            Ok(())
1489        })
1490    }
1491
1492    /// Validates that each value in self.buffers (typed as T)
1493    /// is within the range [0, max_value], inclusive
1494    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1495    where
1496        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1497    {
1498        let required_len = self.len + self.offset;
1499        let buffer = &self.buffers[0];
1500
1501        // This should have been checked as part of `validate()` prior
1502        // to calling `validate_full()` but double check to be sure
1503        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1504
1505        // Justification: buffer size was validated above
1506        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1507
1508        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1509            // Do not check the value is null (value can be arbitrary)
1510            if self.is_null(i) {
1511                return Ok(());
1512            }
1513            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1514                ArrowError::InvalidArgumentError(format!(
1515                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1516                ))
1517            })?;
1518
1519            if dict_index < 0 || dict_index > max_value {
1520                return Err(ArrowError::InvalidArgumentError(format!(
1521                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1522                )));
1523            }
1524            Ok(())
1525        })
1526    }
1527
1528    /// Validates that each value in run_ends array is positive and strictly increasing.
1529    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1530    where
1531        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1532    {
1533        let values = self.typed_buffer::<T>(0, self.len)?;
1534        let mut prev_value: i64 = 0_i64;
1535        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1536            let value: i64 = inp_value.try_into().map_err(|_| {
1537                ArrowError::InvalidArgumentError(format!(
1538                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1539                ))
1540            })?;
1541            if value <= 0_i64 {
1542                return Err(ArrowError::InvalidArgumentError(format!(
1543                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1544                )));
1545            }
1546            if ix > 0 && value <= prev_value {
1547                return Err(ArrowError::InvalidArgumentError(format!(
1548                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1549                )));
1550            }
1551
1552            prev_value = value;
1553            Ok(())
1554        })?;
1555
1556        if prev_value.as_usize() < (self.offset + self.len) {
1557            return Err(ArrowError::InvalidArgumentError(format!(
1558                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1559                self.offset + self.len
1560            )));
1561        }
1562        Ok(())
1563    }
1564
1565    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1566    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1567    /// return false when the arrays are logically equal
1568    pub fn ptr_eq(&self, other: &Self) -> bool {
1569        if self.offset != other.offset
1570            || self.len != other.len
1571            || self.data_type != other.data_type
1572            || self.buffers.len() != other.buffers.len()
1573            || self.child_data.len() != other.child_data.len()
1574        {
1575            return false;
1576        }
1577
1578        match (&self.nulls, &other.nulls) {
1579            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1580            (Some(_), None) | (None, Some(_)) => return false,
1581            _ => {}
1582        };
1583
1584        if !self
1585            .buffers
1586            .iter()
1587            .zip(other.buffers.iter())
1588            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1589        {
1590            return false;
1591        }
1592
1593        self.child_data
1594            .iter()
1595            .zip(other.child_data.iter())
1596            .all(|(a, b)| a.ptr_eq(b))
1597    }
1598
1599    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1600    pub fn into_builder(self) -> ArrayDataBuilder {
1601        self.into()
1602    }
1603}
1604
1605/// Return the expected [`DataTypeLayout`] Arrays of this data
1606/// type are expected to have
1607pub fn layout(data_type: &DataType) -> DataTypeLayout {
1608    // based on C/C++ implementation in
1609    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1610    use arrow_schema::IntervalUnit::*;
1611
1612    match data_type {
1613        DataType::Null => DataTypeLayout {
1614            buffers: vec![],
1615            can_contain_null_mask: false,
1616            variadic: false,
1617        },
1618        DataType::Boolean => DataTypeLayout {
1619            buffers: vec![BufferSpec::BitMap],
1620            can_contain_null_mask: true,
1621            variadic: false,
1622        },
1623        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1624        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1625        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1626        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1627        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1628        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1629        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1630        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1631        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1632        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1633        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1634        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1635        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1636        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1637        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1638        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1639        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1640        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1641        DataType::Interval(MonthDayNano) => {
1642            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1643        }
1644        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1645        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1646        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1647        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1648        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1649        DataType::FixedSizeBinary(size) => {
1650            let spec = BufferSpec::FixedWidth {
1651                byte_width: (*size).try_into().unwrap(),
1652                alignment: mem::align_of::<u8>(),
1653            };
1654            DataTypeLayout {
1655                buffers: vec![spec],
1656                can_contain_null_mask: true,
1657                variadic: false,
1658            }
1659        }
1660        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1661        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1662        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1663        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1664        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1665        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1666        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1667        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1668        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1669        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1670        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1671        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1672        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1673        DataType::Union(_, mode) => {
1674            let type_ids = BufferSpec::FixedWidth {
1675                byte_width: mem::size_of::<i8>(),
1676                alignment: mem::align_of::<i8>(),
1677            };
1678
1679            DataTypeLayout {
1680                buffers: match mode {
1681                    UnionMode::Sparse => {
1682                        vec![type_ids]
1683                    }
1684                    UnionMode::Dense => {
1685                        vec![
1686                            type_ids,
1687                            BufferSpec::FixedWidth {
1688                                byte_width: mem::size_of::<i32>(),
1689                                alignment: mem::align_of::<i32>(),
1690                            },
1691                        ]
1692                    }
1693                },
1694                can_contain_null_mask: false,
1695                variadic: false,
1696            }
1697        }
1698        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1699    }
1700}
1701
/// Layout specification for a data type
#[derive(Debug, PartialEq, Eq)]
// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
pub struct DataTypeLayout {
    /// The layout specification of each expected buffer, in order
    pub buffers: Vec<BufferSpec>,

    /// Whether arrays with this layout can contain a null bitmask
    pub can_contain_null_mask: bool,

    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
    /// buffers.len(). Buffers that exceed the lower bound are legal.
    pub variadic: bool,
}

impl DataTypeLayout {
    /// Spec of a fixed width buffer whose elements have the size and alignment of `T`
    fn fixed_width_spec<T>() -> BufferSpec {
        BufferSpec::FixedWidth {
            byte_width: mem::size_of::<T>(),
            alignment: mem::align_of::<T>(),
        }
    }

    /// Describes a basic numeric array where each element has type `T`
    pub fn new_fixed_width<T>() -> Self {
        Self {
            buffers: vec![Self::fixed_width_spec::<T>()],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// but may still have a Null Bitmap (e.g. FixedSizeList)
    pub fn new_nullable_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// (e.g. RunEndEncoded).
    pub fn new_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: false,
            variadic: false,
        }
    }

    /// Describes a variable width array made of a fixed width offset buffer
    /// with elements of type `T`, followed by a variable width data buffer
    pub fn new_binary<T>() -> Self {
        let offsets = Self::fixed_width_spec::<T>();
        let values = BufferSpec::VariableWidth;
        Self {
            buffers: vec![offsets, values],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes a view type: one fixed width buffer of `u128`-sized elements,
    /// optionally followed by further buffers (the layout is variadic)
    pub fn new_view() -> Self {
        Self {
            buffers: vec![Self::fixed_width_spec::<u128>()],
            can_contain_null_mask: true,
            variadic: true,
        }
    }

    /// Describes a list view type: two fixed width buffers with elements of type `T`
    pub fn new_list_view<T>() -> Self {
        Self {
            buffers: vec![
                Self::fixed_width_spec::<T>(),
                Self::fixed_width_spec::<T>(),
            ],
            can_contain_null_mask: true,
            // NOTE(review): the `variadic` field is documented as applying to the
            // binary/utf8 view types only, yet it is set here as well; confirm
            // this is intended
            variadic: true,
        }
    }
}

/// Layout specification for a single data type buffer
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
    ///
    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
    ///
    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
    ///
    /// Note that these alignment requirements will vary between architectures
    FixedWidth {
        /// Size of a single element, in bytes
        byte_width: usize,
        /// Alignment Rust requires for an array of the corresponding primitive
        alignment: usize,
    },
    /// Variable width, such as string data for utf8 data
    VariableWidth,
    /// Buffer holds a bitmap.
    ///
    /// Note: Unlike the C++ implementation, the null/validity buffer
    /// is handled specially rather than as another of the buffers in
    /// the spec, so this variant is only used for the Boolean type.
    BitMap,
    /// Buffer is always null. Unused currently in Rust implementation,
    /// (used in C++ for Union type)
    #[allow(dead_code)]
    AlwaysNull,
}
1833
1834impl PartialEq for ArrayData {
1835    fn eq(&self, other: &Self) -> bool {
1836        equal::equal(self, other)
1837    }
1838}
1839
/// A boolean flag whose value can only be changed from `unsafe` code.
///
/// The flag starts out as `false`.
///
/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
///
/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
///
/// # Example
/// ```rust
/// use arrow_data::UnsafeFlag;
/// assert!(!UnsafeFlag::default().get()); // default is false
/// let mut flag = UnsafeFlag::new();
/// assert!(!flag.get()); // defaults to false
/// // can only set it to true in unsafe code
/// unsafe { flag.set(true) };
/// assert!(flag.get()); // now true
/// ```
#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a new `UnsafeFlag` with the value set to `false`.
    ///
    /// See the example on [`UnsafeFlag`]
    #[inline]
    pub const fn new() -> Self {
        UnsafeFlag(false)
    }

    /// Sets the value of the flag to the given value
    ///
    /// Note this can purposely only be done in `unsafe` code
    ///
    /// # Safety
    ///
    /// If set, the flag will be set to the given value. There is nothing
    /// immediately unsafe about doing so, however, the flag can be used to
    /// subsequently bypass safety checks in the [`ArrayDataBuilder`].
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        let Self(inner) = self;
        *inner = val;
    }

    /// Returns the current value of the flag
    #[inline]
    pub fn get(&self) -> bool {
        let Self(inner) = self;
        *inner
    }
}

// Implemented by hand rather than derived, to make it clear that a flag
// cannot be constructed with the value `true`
impl Default for UnsafeFlag {
    fn default() -> Self {
        UnsafeFlag::new()
    }
}
1898
1899/// Builder for [`ArrayData`] type
1900#[derive(Debug)]
1901pub struct ArrayDataBuilder {
1902    data_type: DataType,
1903    len: usize,
1904    null_count: Option<usize>,
1905    null_bit_buffer: Option<Buffer>,
1906    nulls: Option<NullBuffer>,
1907    offset: usize,
1908    buffers: Vec<Buffer>,
1909    child_data: Vec<ArrayData>,
1910    /// Should buffers be realigned (copying if necessary)?
1911    ///
1912    /// Defaults to false.
1913    align_buffers: bool,
1914    /// Should data validation be skipped for this [`ArrayData`]?
1915    ///
1916    /// Defaults to false.
1917    ///
1918    /// # Safety
1919    ///
1920    /// This flag can only be set to true using `unsafe` APIs. However, once true
1921    /// subsequent calls to `build()` may result in undefined behavior if the data
1922    /// is not valid.
1923    skip_validation: UnsafeFlag,
1924}
1925
1926impl ArrayDataBuilder {
1927    #[inline]
1928    /// Creates a new array data builder
1929    pub const fn new(data_type: DataType) -> Self {
1930        Self {
1931            data_type,
1932            len: 0,
1933            null_count: None,
1934            null_bit_buffer: None,
1935            nulls: None,
1936            offset: 0,
1937            buffers: vec![],
1938            child_data: vec![],
1939            align_buffers: false,
1940            skip_validation: UnsafeFlag::new(),
1941        }
1942    }
1943
1944    /// Creates a new array data builder from an existing one, changing the data type
1945    pub fn data_type(self, data_type: DataType) -> Self {
1946        Self { data_type, ..self }
1947    }
1948
1949    #[inline]
1950    #[allow(clippy::len_without_is_empty)]
1951    /// Sets the length of the [ArrayData]
1952    pub const fn len(mut self, n: usize) -> Self {
1953        self.len = n;
1954        self
1955    }
1956
1957    /// Sets the null buffer of the [ArrayData]
1958    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1959        self.nulls = nulls;
1960        self.null_count = None;
1961        self.null_bit_buffer = None;
1962        self
1963    }
1964
1965    /// Sets the null count of the [ArrayData]
1966    pub fn null_count(mut self, null_count: usize) -> Self {
1967        self.null_count = Some(null_count);
1968        self
1969    }
1970
1971    /// Sets the `null_bit_buffer` of the [ArrayData]
1972    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1973        self.nulls = None;
1974        self.null_bit_buffer = buf;
1975        self
1976    }
1977
1978    /// Sets the offset of the [ArrayData]
1979    #[inline]
1980    pub const fn offset(mut self, n: usize) -> Self {
1981        self.offset = n;
1982        self
1983    }
1984
1985    /// Sets the buffers of the [ArrayData]
1986    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1987        self.buffers = v;
1988        self
1989    }
1990
1991    /// Adds a single buffer to the [ArrayData]'s buffers
1992    pub fn add_buffer(mut self, b: Buffer) -> Self {
1993        self.buffers.push(b);
1994        self
1995    }
1996
1997    /// Adds multiple buffers to the [ArrayData]'s buffers
1998    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1999        self.buffers.extend(bs);
2000        self
2001    }
2002
2003    /// Sets the child data of the [ArrayData]
2004    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
2005        self.child_data = v;
2006        self
2007    }
2008
2009    /// Adds a single child data to the [ArrayData]'s child data
2010    pub fn add_child_data(mut self, r: ArrayData) -> Self {
2011        self.child_data.push(r);
2012        self
2013    }
2014
2015    /// Creates an array data, without any validation
2016    ///
2017    /// Note: This is shorthand for
2018    /// ```rust
2019    /// # #[expect(unsafe_op_in_unsafe_fn)]
2020    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
2021    /// # let _ = unsafe {
2022    /// builder.skip_validation(true).build().unwrap()
2023    /// # };
2024    /// ```
2025    ///
2026    /// # Safety
2027    ///
2028    /// The same caveats as [`ArrayData::new_unchecked`]
2029    /// apply.
2030    pub unsafe fn build_unchecked(self) -> ArrayData {
2031        unsafe { self.skip_validation(true) }.build().unwrap()
2032    }
2033
2034    /// Creates an `ArrayData`, consuming `self`
2035    ///
2036    /// # Safety
2037    ///
2038    /// By default the underlying buffers are checked to ensure they are valid
2039    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
2040    /// to true (by the `unsafe` API) this validation is skipped. If the data is
2041    /// not valid, undefined behavior will result.
2042    pub fn build(self) -> Result<ArrayData, ArrowError> {
2043        let Self {
2044            data_type,
2045            len,
2046            null_count,
2047            null_bit_buffer,
2048            nulls,
2049            offset,
2050            buffers,
2051            child_data,
2052            align_buffers,
2053            skip_validation,
2054        } = self;
2055
2056        let nulls = nulls
2057            .or_else(|| {
2058                let buffer = null_bit_buffer?;
2059                let buffer = BooleanBuffer::new(buffer, offset, len);
2060                Some(match null_count {
2061                    Some(n) => {
2062                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2063                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2064                    }
2065                    None => NullBuffer::new(buffer),
2066                })
2067            })
2068            .filter(|b| b.null_count() != 0);
2069
2070        let mut data = ArrayData {
2071            data_type,
2072            len,
2073            offset,
2074            buffers,
2075            child_data,
2076            nulls,
2077        };
2078
2079        if align_buffers {
2080            data.align_buffers();
2081        }
2082
2083        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2084        if !skip_validation.get() || cfg!(feature = "force_validate") {
2085            data.validate_data()?;
2086        }
2087        Ok(data)
2088    }
2089
2090    /// Creates an array data, validating all inputs, and aligning any buffers
2091    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2092    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2093        self.align_buffers(true).build()
2094    }
2095
2096    /// Ensure that all buffers are aligned, copying data if necessary
2097    ///
2098    /// Rust requires that arrays are aligned to their corresponding primitive,
2099    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2100    ///
2101    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2102    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2103    ///
2104    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2105    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2106    /// when necessary, making it useful when interacting with buffers produced by other systems,
2107    /// e.g. IPC or FFI.
2108    ///
2109    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2110    /// insufficiently aligned buffers.
2111    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2112        self.align_buffers = align_buffers;
2113        self
2114    }
2115
2116    /// Skips validation of the data.
2117    ///
2118    /// If this flag is enabled, `[Self::build`] will skip validation of the
2119    /// data
2120    ///
2121    /// If this flag is not enabled, `[Self::build`] will validate that all
2122    /// buffers are valid and will return an error if any data is invalid.
2123    /// Validation can be expensive.
2124    ///
2125    /// # Safety
2126    ///
2127    /// If validation is skipped, the buffers must form a valid Arrow array,
2128    /// otherwise undefined behavior will result
2129    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2130        unsafe {
2131            self.skip_validation.set(skip_validation);
2132        }
2133        self
2134    }
2135}
2136
2137impl From<ArrayData> for ArrayDataBuilder {
2138    fn from(d: ArrayData) -> Self {
2139        Self {
2140            data_type: d.data_type,
2141            len: d.len,
2142            offset: d.offset,
2143            buffers: d.buffers,
2144            child_data: d.child_data,
2145            nulls: d.nulls,
2146            null_bit_buffer: None,
2147            null_count: None,
2148            align_buffers: false,
2149            skip_validation: UnsafeFlag::new(),
2150        }
2151    }
2152}
2153
2154#[cfg(test)]
2155mod tests {
2156    use super::*;
2157    use arrow_schema::{Field, Fields};
2158
2159    // See arrow/tests/array_data_validation.rs for test of array validation
2160
2161    /// returns a buffer initialized with some constant value for tests
2162    fn make_i32_buffer(n: usize) -> Buffer {
2163        Buffer::from_slice_ref(vec![42i32; n])
2164    }
2165
2166    /// returns a buffer initialized with some constant value for tests
2167    fn make_f32_buffer(n: usize) -> Buffer {
2168        Buffer::from_slice_ref(vec![42f32; n])
2169    }
2170
2171    #[test]
2172    fn test_builder() {
2173        // Buffer needs to be at least 25 long
2174        let v = (0..25).collect::<Vec<i32>>();
2175        let b1 = Buffer::from_slice_ref(&v);
2176        let arr_data = ArrayData::builder(DataType::Int32)
2177            .len(20)
2178            .offset(5)
2179            .add_buffer(b1)
2180            .null_bit_buffer(Some(Buffer::from([
2181                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2182            ])))
2183            .build()
2184            .unwrap();
2185
2186        assert_eq!(20, arr_data.len());
2187        assert_eq!(10, arr_data.null_count());
2188        assert_eq!(5, arr_data.offset());
2189        assert_eq!(1, arr_data.buffers().len());
2190        assert_eq!(
2191            Buffer::from_slice_ref(&v).as_slice(),
2192            arr_data.buffers()[0].as_slice()
2193        );
2194    }
2195
2196    #[test]
2197    fn test_builder_with_child_data() {
2198        let child_arr_data = ArrayData::try_new(
2199            DataType::Int32,
2200            5,
2201            None,
2202            0,
2203            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2204            vec![],
2205        )
2206        .unwrap();
2207
2208        let field = Arc::new(Field::new("x", DataType::Int32, true));
2209        let data_type = DataType::Struct(vec![field].into());
2210
2211        let arr_data = ArrayData::builder(data_type)
2212            .len(5)
2213            .offset(0)
2214            .add_child_data(child_arr_data.clone())
2215            .build()
2216            .unwrap();
2217
2218        assert_eq!(5, arr_data.len());
2219        assert_eq!(1, arr_data.child_data().len());
2220        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2221    }
2222
2223    #[test]
2224    fn test_null_count() {
2225        let mut bit_v: [u8; 2] = [0; 2];
2226        bit_util::set_bit(&mut bit_v, 0);
2227        bit_util::set_bit(&mut bit_v, 3);
2228        bit_util::set_bit(&mut bit_v, 10);
2229        let arr_data = ArrayData::builder(DataType::Int32)
2230            .len(16)
2231            .add_buffer(make_i32_buffer(16))
2232            .null_bit_buffer(Some(Buffer::from(bit_v)))
2233            .build()
2234            .unwrap();
2235        assert_eq!(13, arr_data.null_count());
2236
2237        // Test with offset
2238        let mut bit_v: [u8; 2] = [0; 2];
2239        bit_util::set_bit(&mut bit_v, 0);
2240        bit_util::set_bit(&mut bit_v, 3);
2241        bit_util::set_bit(&mut bit_v, 10);
2242        let arr_data = ArrayData::builder(DataType::Int32)
2243            .len(12)
2244            .offset(2)
2245            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2246            .null_bit_buffer(Some(Buffer::from(bit_v)))
2247            .build()
2248            .unwrap();
2249        assert_eq!(10, arr_data.null_count());
2250    }
2251
2252    #[test]
2253    fn test_null_buffer_ref() {
2254        let mut bit_v: [u8; 2] = [0; 2];
2255        bit_util::set_bit(&mut bit_v, 0);
2256        bit_util::set_bit(&mut bit_v, 3);
2257        bit_util::set_bit(&mut bit_v, 10);
2258        let arr_data = ArrayData::builder(DataType::Int32)
2259            .len(16)
2260            .add_buffer(make_i32_buffer(16))
2261            .null_bit_buffer(Some(Buffer::from(bit_v)))
2262            .build()
2263            .unwrap();
2264        assert!(arr_data.nulls().is_some());
2265        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2266    }
2267
2268    #[test]
2269    fn test_slice() {
2270        let mut bit_v: [u8; 2] = [0; 2];
2271        bit_util::set_bit(&mut bit_v, 0);
2272        bit_util::set_bit(&mut bit_v, 3);
2273        bit_util::set_bit(&mut bit_v, 10);
2274        let data = ArrayData::builder(DataType::Int32)
2275            .len(16)
2276            .add_buffer(make_i32_buffer(16))
2277            .null_bit_buffer(Some(Buffer::from(bit_v)))
2278            .build()
2279            .unwrap();
2280        let new_data = data.slice(1, 15);
2281        assert_eq!(data.len() - 1, new_data.len());
2282        assert_eq!(1, new_data.offset());
2283        assert_eq!(data.null_count(), new_data.null_count());
2284
2285        // slice of a slice (removes one null)
2286        let new_data = new_data.slice(1, 14);
2287        assert_eq!(data.len() - 2, new_data.len());
2288        assert_eq!(2, new_data.offset());
2289        assert_eq!(data.null_count() - 1, new_data.null_count());
2290    }
2291
2292    #[test]
2293    fn test_equality() {
2294        let int_data = ArrayData::builder(DataType::Int32)
2295            .len(1)
2296            .add_buffer(make_i32_buffer(1))
2297            .build()
2298            .unwrap();
2299
2300        let float_data = ArrayData::builder(DataType::Float32)
2301            .len(1)
2302            .add_buffer(make_f32_buffer(1))
2303            .build()
2304            .unwrap();
2305        assert_ne!(int_data, float_data);
2306        assert!(!int_data.ptr_eq(&float_data));
2307        assert!(int_data.ptr_eq(&int_data));
2308
2309        #[allow(clippy::redundant_clone)]
2310        let int_data_clone = int_data.clone();
2311        assert_eq!(int_data, int_data_clone);
2312        assert!(int_data.ptr_eq(&int_data_clone));
2313        assert!(int_data_clone.ptr_eq(&int_data));
2314
2315        let int_data_slice = int_data_clone.slice(1, 0);
2316        assert!(int_data_slice.ptr_eq(&int_data_slice));
2317        assert!(!int_data.ptr_eq(&int_data_slice));
2318        assert!(!int_data_slice.ptr_eq(&int_data));
2319
2320        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2321        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2322        let string_data = ArrayData::try_new(
2323            DataType::Utf8,
2324            3,
2325            Some(Buffer::from_iter(vec![true, false, true])),
2326            0,
2327            vec![offsets_buffer, data_buffer],
2328            vec![],
2329        )
2330        .unwrap();
2331
2332        assert_ne!(float_data, string_data);
2333        assert!(!float_data.ptr_eq(&string_data));
2334
2335        assert!(string_data.ptr_eq(&string_data));
2336
2337        #[allow(clippy::redundant_clone)]
2338        let string_data_cloned = string_data.clone();
2339        assert!(string_data_cloned.ptr_eq(&string_data));
2340        assert!(string_data.ptr_eq(&string_data_cloned));
2341
2342        let string_data_slice = string_data.slice(1, 2);
2343        assert!(string_data_slice.ptr_eq(&string_data_slice));
2344        assert!(!string_data_slice.ptr_eq(&string_data))
2345    }
2346
2347    #[test]
2348    fn test_slice_memory_size() {
2349        let mut bit_v: [u8; 2] = [0; 2];
2350        bit_util::set_bit(&mut bit_v, 0);
2351        bit_util::set_bit(&mut bit_v, 3);
2352        bit_util::set_bit(&mut bit_v, 10);
2353        let data = ArrayData::builder(DataType::Int32)
2354            .len(16)
2355            .add_buffer(make_i32_buffer(16))
2356            .null_bit_buffer(Some(Buffer::from(bit_v)))
2357            .build()
2358            .unwrap();
2359        let new_data = data.slice(1, 14);
2360        assert_eq!(
2361            data.get_slice_memory_size().unwrap() - 8,
2362            new_data.get_slice_memory_size().unwrap()
2363        );
2364        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2365        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2366        let string_data = ArrayData::try_new(
2367            DataType::Utf8,
2368            3,
2369            Some(Buffer::from_iter(vec![true, false, true])),
2370            0,
2371            vec![offsets_buffer, data_buffer],
2372            vec![],
2373        )
2374        .unwrap();
2375        let string_data_slice = string_data.slice(1, 2);
2376        //4 bytes of offset and 2 bytes of data reduced by slicing.
2377        assert_eq!(
2378            string_data.get_slice_memory_size().unwrap() - 6,
2379            string_data_slice.get_slice_memory_size().unwrap()
2380        );
2381    }
2382
2383    #[test]
2384    fn test_count_nulls() {
2385        let buffer = Buffer::from([0b00010110, 0b10011111]);
2386        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2387        let count = count_nulls(Some(&buffer), 0, 16);
2388        assert_eq!(count, 7);
2389
2390        let count = count_nulls(Some(&buffer), 4, 8);
2391        assert_eq!(count, 3);
2392    }
2393
2394    #[test]
2395    fn test_contains_nulls() {
2396        let buffer: Buffer =
2397            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2398        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2399        assert!(contains_nulls(Some(&buffer), 0, 6));
2400        assert!(contains_nulls(Some(&buffer), 0, 3));
2401        assert!(!contains_nulls(Some(&buffer), 3, 2));
2402        assert!(!contains_nulls(Some(&buffer), 0, 0));
2403    }
2404
2405    #[test]
2406    fn test_alignment() {
2407        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2408        let sliced = buffer.slice(1);
2409
2410        let mut data = ArrayData {
2411            data_type: DataType::Int32,
2412            len: 0,
2413            offset: 0,
2414            buffers: vec![buffer],
2415            child_data: vec![],
2416            nulls: None,
2417        };
2418        data.validate_full().unwrap();
2419
2420        // break alignment in data
2421        data.buffers[0] = sliced;
2422        let err = data.validate().unwrap_err();
2423
2424        assert_eq!(
2425            err.to_string(),
2426            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2427        );
2428
2429        data.align_buffers();
2430        data.validate_full().unwrap();
2431    }
2432
2433    #[test]
2434    fn test_alignment_struct() {
2435        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2436        let sliced = buffer.slice(1);
2437
2438        let child_data = ArrayData {
2439            data_type: DataType::Int32,
2440            len: 0,
2441            offset: 0,
2442            buffers: vec![buffer],
2443            child_data: vec![],
2444            nulls: None,
2445        };
2446
2447        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2448        let mut data = ArrayData {
2449            data_type: schema,
2450            len: 0,
2451            offset: 0,
2452            buffers: vec![],
2453            child_data: vec![child_data],
2454            nulls: None,
2455        };
2456        data.validate_full().unwrap();
2457
2458        // break alignment in child data
2459        data.child_data[0].buffers[0] = sliced;
2460        let err = data.validate().unwrap_err();
2461
2462        assert_eq!(
2463            err.to_string(),
2464            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2465        );
2466
2467        data.align_buffers();
2468        data.validate_full().unwrap();
2469    }
2470
2471    #[test]
2472    fn test_null_view_types() {
2473        let array_len = 32;
2474        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2475        assert_eq!(array.len(), array_len);
2476        for i in 0..array.len() {
2477            assert!(array.is_null(i));
2478        }
2479
2480        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2481        assert_eq!(array.len(), array_len);
2482        for i in 0..array.len() {
2483            assert!(array.is_null(i));
2484        }
2485
2486        let array = ArrayData::new_null(
2487            &DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2488            array_len,
2489        );
2490        assert_eq!(array.len(), array_len);
2491        for i in 0..array.len() {
2492            assert!(array.is_null(i));
2493        }
2494
2495        let array = ArrayData::new_null(
2496            &DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2497            array_len,
2498        );
2499        assert_eq!(array.len(), array_len);
2500        for i in 0..array.len() {
2501            assert!(array.is_null(i));
2502        }
2503    }
2504}