arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
19//! common attributes and operations for Arrow array.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35    null_bit_buffer: Option<&NullBuffer>,
36    offset: usize,
37    len: usize,
38) -> bool {
39    match null_bit_buffer {
40        Some(buffer) => {
41            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42                Some((start, end)) => start != 0 || end != len,
43                None => len != 0, // No non-null values
44            }
45        }
46        None => false, // No null buffer
47    }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52    null_bit_buffer: Option<&NullBuffer>,
53    offset: usize,
54    len: usize,
55) -> usize {
56    if let Some(buf) = null_bit_buffer {
57        let buffer = buf.buffer();
58        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59    } else {
60        0
61    }
62}
63
64/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67    let empty_buffer = MutableBuffer::new(0);
68    match data_type {
69        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70        DataType::Boolean => {
71            let bytes = bit_util::ceil(capacity, 8);
72            let buffer = MutableBuffer::new(bytes);
73            [buffer, empty_buffer]
74        }
75        DataType::UInt8
76        | DataType::UInt16
77        | DataType::UInt32
78        | DataType::UInt64
79        | DataType::Int8
80        | DataType::Int16
81        | DataType::Int32
82        | DataType::Int64
83        | DataType::Float16
84        | DataType::Float32
85        | DataType::Float64
86        | DataType::Decimal32(_, _)
87        | DataType::Decimal64(_, _)
88        | DataType::Decimal128(_, _)
89        | DataType::Decimal256(_, _)
90        | DataType::Date32
91        | DataType::Time32(_)
92        | DataType::Date64
93        | DataType::Time64(_)
94        | DataType::Duration(_)
95        | DataType::Timestamp(_, _)
96        | DataType::Interval(_) => [
97            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98            empty_buffer,
99        ],
100        DataType::Utf8 | DataType::Binary => {
101            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102            // safety: `unsafe` code assumes that this buffer is initialized with one element
103            buffer.push(0i32);
104            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105        }
106        DataType::LargeUtf8 | DataType::LargeBinary => {
107            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108            // safety: `unsafe` code assumes that this buffer is initialized with one element
109            buffer.push(0i64);
110            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111        }
112        DataType::BinaryView | DataType::Utf8View => [
113            MutableBuffer::new(capacity * mem::size_of::<u128>()),
114            empty_buffer,
115        ],
116        DataType::List(_) | DataType::Map(_, _) => {
117            // offset buffer always starts with a zero
118            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119            buffer.push(0i32);
120            [buffer, empty_buffer]
121        }
122        DataType::ListView(_) => [
123            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124            MutableBuffer::new(capacity * mem::size_of::<i32>()),
125        ],
126        DataType::LargeList(_) => {
127            // offset buffer always starts with a zero
128            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129            buffer.push(0i64);
130            [buffer, empty_buffer]
131        }
132        DataType::LargeListView(_) => [
133            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134            MutableBuffer::new(capacity * mem::size_of::<i64>()),
135        ],
136        DataType::FixedSizeBinary(size) => {
137            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138        }
139        DataType::Dictionary(k, _) => [
140            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141            empty_buffer,
142        ],
143        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144            [empty_buffer, MutableBuffer::new(0)]
145        }
146        DataType::Union(_, mode) => {
147            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148            match mode {
149                UnionMode::Sparse => [type_ids, empty_buffer],
150                UnionMode::Dense => {
151                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152                    [type_ids, offsets]
153                }
154            }
155        }
156    }
157}
158
159/// A generic representation of Arrow array data which encapsulates common attributes
160/// and operations for Arrow array.
161///
162/// Specific operations for different arrays types (e.g., primitive, list, struct)
163/// are implemented in `Array`.
164///
165/// # Memory Layout
166///
167/// `ArrayData` has references to one or more underlying data buffers
168/// and optional child ArrayData, depending on type as illustrated
169/// below. Bitmaps are not shown for simplicity but they are stored
170/// similarly to the buffers.
171///
172/// ```text
173///                        offset
174///                       points to
175/// ┌───────────────────┐ start of  ┌───────┐       Different
176/// │                   │   data    │       │     ArrayData may
177/// │ArrayData {        │           │....   │     also refers to
178/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
179/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
180/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
181/// │  buffers: [       │           │5882   │◀─
182/// │    ...            │  │        │4323   │
183/// │  ]                │   ─ ─ ─ ─▶│4859   │
184/// │  child_data: [    │           │....   │
185/// │    ...            │           │       │
186/// │  ]                │           └───────┘
187/// │}                  │
188/// │                   │            Shared Buffer uses
189/// │               │   │            bytes::Bytes to hold
190/// └───────────────────┘            actual data values
191///           ┌ ─ ─ ┘
192///
193///           ▼
194/// ┌───────────────────┐
195/// │ArrayData {        │
196/// │  ...              │
197/// │}                  │
198/// │                   │
199/// └───────────────────┘
200///
201/// Child ArrayData may also have its own buffers and children
202/// ```
203
204#[derive(Debug, Clone)]
205pub struct ArrayData {
206    /// The data type
207    data_type: DataType,
208
209    /// The number of elements
210    len: usize,
211
212    /// The offset in number of items (not bytes).
213    ///
214    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
215    /// does NOT apply to [`Self::nulls`].
216    offset: usize,
217
218    /// The buffers that store the actual data for this array, as defined
219    /// in the [Arrow Spec].
220    ///
221    /// Depending on the array types, [`Self::buffers`] can hold different
222    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
223    /// positions.
224    ///
225    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
226    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
227    /// we need to satisfy `len`.
228    ///
229    /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout)
230    buffers: Vec<Buffer>,
231
232    /// The child(ren) of this array.
233    ///
234    /// Only non-empty for nested types, such as `ListArray` and
235    /// `StructArray`.
236    ///
237    /// The first logical element in each child element begins at `offset`.
238    ///
239    /// If the child element also has an offset then these offsets are
240    /// cumulative.
241    child_data: Vec<ArrayData>,
242
243    /// The null bitmap.
244    ///
245    /// `None` indicates all values are non-null in this array.
246    ///
247    /// [`Self::offset]` does not apply to the null bitmap. While the
248    /// BooleanBuffer may be sliced (have its own offset) internally, this
249    /// `NullBuffer` always represents exactly `len` elements.
250    nulls: Option<NullBuffer>,
251}
252
253/// A thread-safe, shared reference to the Arrow array data.
254pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257    /// Create a new ArrayData instance;
258    ///
259    /// If `null_count` is not specified, the number of nulls in
260    /// null_bit_buffer is calculated.
261    ///
262    /// If the number of nulls is 0 then the null_bit_buffer
263    /// is set to `None`.
264    ///
265    /// # Safety
266    ///
267    /// The input values *must* form a valid Arrow array for
268    /// `data_type`, or undefined behavior can result.
269    ///
270    /// Note: This is a low level API and most users of the arrow
271    /// crate should create arrays using the methods in the `array`
272    /// module.
273    pub unsafe fn new_unchecked(
274        data_type: DataType,
275        len: usize,
276        null_count: Option<usize>,
277        null_bit_buffer: Option<Buffer>,
278        offset: usize,
279        buffers: Vec<Buffer>,
280        child_data: Vec<ArrayData>,
281    ) -> Self {
282        let mut skip_validation = UnsafeFlag::new();
283        // SAFETY: caller responsible for ensuring data is valid
284        unsafe { skip_validation.set(true) };
285
286        ArrayDataBuilder {
287            data_type,
288            len,
289            null_count,
290            null_bit_buffer,
291            nulls: None,
292            offset,
293            buffers,
294            child_data,
295            align_buffers: false,
296            skip_validation,
297        }
298        .build()
299        .unwrap()
300    }
301
302    /// Create a new ArrayData, validating that the provided buffers form a valid
303    /// Arrow array of the specified data type.
304    ///
305    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
306    /// is set to `None`.
307    ///
308    /// Internally this calls through to [`Self::validate_data`]
309    ///
310    /// Note: This is a low level API and most users of the arrow crate should create
311    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
312    pub fn try_new(
313        data_type: DataType,
314        len: usize,
315        null_bit_buffer: Option<Buffer>,
316        offset: usize,
317        buffers: Vec<Buffer>,
318        child_data: Vec<ArrayData>,
319    ) -> Result<Self, ArrowError> {
320        // we must check the length of `null_bit_buffer` first
321        // because we use this buffer to calculate `null_count`
322        // in `Self::new_unchecked`.
323        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
324            let needed_len = bit_util::ceil(len + offset, 8);
325            if null_bit_buffer.len() < needed_len {
326                return Err(ArrowError::InvalidArgumentError(format!(
327                    "null_bit_buffer size too small. got {} needed {}",
328                    null_bit_buffer.len(),
329                    needed_len
330                )));
331            }
332        }
333        // Safety justification: `validate_full` is called below
334        let new_self = unsafe {
335            Self::new_unchecked(
336                data_type,
337                len,
338                None,
339                null_bit_buffer,
340                offset,
341                buffers,
342                child_data,
343            )
344        };
345
346        // As the data is not trusted, do a full validation of its contents
347        // We don't need to validate children as we can assume that the
348        // [`ArrayData`] in `child_data` have already been validated through
349        // a call to `ArrayData::try_new` or created using unsafe
350        new_self.validate_data()?;
351        Ok(new_self)
352    }
353
354    /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`]
355    #[inline]
356    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
357        ArrayDataBuilder::new(data_type)
358    }
359
360    /// Returns a reference to the [`DataType`] of this [`ArrayData`]
361    #[inline]
362    pub const fn data_type(&self) -> &DataType {
363        &self.data_type
364    }
365
366    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
367    pub fn buffers(&self) -> &[Buffer] {
368        &self.buffers
369    }
370
371    /// Returns a slice of children [`ArrayData`]. This will be non
372    /// empty for type such as lists and structs.
373    pub fn child_data(&self) -> &[ArrayData] {
374        &self.child_data[..]
375    }
376
377    /// Returns whether the element at index `i` is null
378    #[inline]
379    pub fn is_null(&self, i: usize) -> bool {
380        match &self.nulls {
381            Some(v) => v.is_null(i),
382            None => false,
383        }
384    }
385
386    /// Returns a reference to the null buffer of this [`ArrayData`] if any
387    ///
388    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
389    #[inline]
390    pub fn nulls(&self) -> Option<&NullBuffer> {
391        self.nulls.as_ref()
392    }
393
394    /// Returns whether the element at index `i` is not null
395    #[inline]
396    pub fn is_valid(&self, i: usize) -> bool {
397        !self.is_null(i)
398    }
399
400    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
401    #[inline]
402    pub const fn len(&self) -> usize {
403        self.len
404    }
405
406    /// Returns whether this [`ArrayData`] is empty
407    #[inline]
408    pub const fn is_empty(&self) -> bool {
409        self.len == 0
410    }
411
412    /// Returns the offset of this [`ArrayData`]
413    #[inline]
414    pub const fn offset(&self) -> usize {
415        self.offset
416    }
417
418    /// Returns the total number of nulls in this array
419    #[inline]
420    pub fn null_count(&self) -> usize {
421        self.nulls
422            .as_ref()
423            .map(|x| x.null_count())
424            .unwrap_or_default()
425    }
426
427    /// Returns the total number of bytes of memory occupied by the
428    /// buffers owned by this [`ArrayData`] and all of its
429    /// children. (See also diagram on [`ArrayData`]).
430    ///
431    /// Note that this [`ArrayData`] may only refer to a subset of the
432    /// data in the underlying [`Buffer`]s (due to `offset` and
433    /// `length`), but the size returned includes the entire size of
434    /// the buffers.
435    ///
436    /// If multiple [`ArrayData`]s refer to the same underlying
437    /// [`Buffer`]s they will both report the same size.
438    pub fn get_buffer_memory_size(&self) -> usize {
439        let mut size = 0;
440        for buffer in &self.buffers {
441            size += buffer.capacity();
442        }
443        if let Some(bitmap) = &self.nulls {
444            size += bitmap.buffer().capacity()
445        }
446        for child in &self.child_data {
447            size += child.get_buffer_memory_size();
448        }
449        size
450    }
451
452    /// Returns the total number of the bytes of memory occupied by
453    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
454    ///
455    /// This is approximately the number of bytes if a new
456    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
457    /// exactly the data needed.
458    ///
459    /// For example, a [`DataType::Int64`] with `100` elements,
460    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
461    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
462    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
463    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
464    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
465        let mut result: usize = 0;
466        let layout = layout(&self.data_type);
467
468        for spec in layout.buffers.iter() {
469            match spec {
470                BufferSpec::FixedWidth { byte_width, .. } => {
471                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
472                        ArrowError::ComputeError(
473                            "Integer overflow computing buffer size".to_string(),
474                        )
475                    })?;
476                    result += buffer_size;
477                }
478                BufferSpec::VariableWidth => {
479                    let buffer_len = match self.data_type {
480                        DataType::Utf8 | DataType::Binary => {
481                            let offsets = self.typed_offsets::<i32>()?;
482                            (offsets[self.len] - offsets[0]) as usize
483                        }
484                        DataType::LargeUtf8 | DataType::LargeBinary => {
485                            let offsets = self.typed_offsets::<i64>()?;
486                            (offsets[self.len] - offsets[0]) as usize
487                        }
488                        _ => {
489                            return Err(ArrowError::NotYetImplemented(format!(
490                                "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
491                                self.data_type
492                            )));
493                        }
494                    };
495                    result += buffer_len;
496                }
497                BufferSpec::BitMap => {
498                    let buffer_size = bit_util::ceil(self.len, 8);
499                    result += buffer_size;
500                }
501                BufferSpec::AlwaysNull => {
502                    // Nothing to do
503                }
504            }
505        }
506
507        if self.nulls().is_some() {
508            result += bit_util::ceil(self.len, 8);
509        }
510
511        for child in &self.child_data {
512            result += child.get_slice_memory_size()?;
513        }
514        Ok(result)
515    }
516
517    /// Returns the total number of bytes of memory occupied
518    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
519    /// children. (See also diagram on [`ArrayData`]).
520    ///
521    /// Equivalent to:
522    ///  `size_of_val(self)` +
523    ///  [`Self::get_buffer_memory_size`] +
524    ///  `size_of_val(child)` for all children
525    pub fn get_array_memory_size(&self) -> usize {
526        let mut size = mem::size_of_val(self);
527
528        // Calculate rest of the fields top down which contain actual data
529        for buffer in &self.buffers {
530            size += mem::size_of::<Buffer>();
531            size += buffer.capacity();
532        }
533        if let Some(nulls) = &self.nulls {
534            size += nulls.buffer().capacity();
535        }
536        for child in &self.child_data {
537            size += child.get_array_memory_size();
538        }
539
540        size
541    }
542
543    /// Creates a zero-copy slice of itself. This creates a new
544    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
545    /// different offset and len
546    ///
547    /// # Panics
548    ///
549    /// Panics if `offset + length > self.len()`.
550    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
551        assert!((offset + length) <= self.len());
552
553        if let DataType::Struct(_) = self.data_type() {
554            // Slice into children
555            let new_offset = self.offset + offset;
556            ArrayData {
557                data_type: self.data_type().clone(),
558                len: length,
559                offset: new_offset,
560                buffers: self.buffers.clone(),
561                // Slice child data, to propagate offsets down to them
562                child_data: self
563                    .child_data()
564                    .iter()
565                    .map(|data| data.slice(offset, length))
566                    .collect(),
567                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
568            }
569        } else {
570            let mut new_data = self.clone();
571
572            new_data.len = length;
573            new_data.offset = offset + self.offset;
574            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
575
576            new_data
577        }
578    }
579
580    /// Returns the `buffer` as a slice of type `T` starting at self.offset
581    ///
582    /// # Panics
583    /// This function panics if:
584    /// * the buffer is not byte-aligned with type T, or
585    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
586    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
587        &self.buffers()[buffer].typed_data()[self.offset..]
588    }
589
590    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
591    pub fn new_null(data_type: &DataType, len: usize) -> Self {
592        let bit_len = bit_util::ceil(len, 8);
593        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
594
595        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
596            Some(width) => (vec![zeroed(width * len)], vec![], true),
597            None => match data_type {
598                DataType::Null => (vec![], vec![], false),
599                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
600                DataType::Binary | DataType::Utf8 => {
601                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
602                }
603                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
604                DataType::LargeBinary | DataType::LargeUtf8 => {
605                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
606                }
607                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
608                DataType::List(f) | DataType::Map(f, _) => (
609                    vec![zeroed((len + 1) * 4)],
610                    vec![ArrayData::new_empty(f.data_type())],
611                    true,
612                ),
613                DataType::LargeList(f) => (
614                    vec![zeroed((len + 1) * 8)],
615                    vec![ArrayData::new_empty(f.data_type())],
616                    true,
617                ),
618                DataType::FixedSizeList(f, list_len) => (
619                    vec![],
620                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
621                    true,
622                ),
623                DataType::Struct(fields) => (
624                    vec![],
625                    fields
626                        .iter()
627                        .map(|f| Self::new_null(f.data_type(), len))
628                        .collect(),
629                    true,
630                ),
631                DataType::Dictionary(k, v) => (
632                    vec![zeroed(k.primitive_width().unwrap() * len)],
633                    vec![ArrayData::new_empty(v.as_ref())],
634                    true,
635                ),
636                DataType::Union(f, mode) => {
637                    let (id, _) = f.iter().next().unwrap();
638                    let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
639                    let buffers = match mode {
640                        UnionMode::Sparse => vec![ids],
641                        UnionMode::Dense => {
642                            let end_offset = i32::from_usize(len).unwrap();
643                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
644                        }
645                    };
646
647                    let children = f
648                        .iter()
649                        .enumerate()
650                        .map(|(idx, (_, f))| {
651                            if idx == 0 || *mode == UnionMode::Sparse {
652                                Self::new_null(f.data_type(), len)
653                            } else {
654                                Self::new_empty(f.data_type())
655                            }
656                        })
657                        .collect();
658
659                    (buffers, children, false)
660                }
661                DataType::RunEndEncoded(r, v) => {
662                    let runs = match r.data_type() {
663                        DataType::Int16 => {
664                            let i = i16::from_usize(len).expect("run overflow");
665                            Buffer::from_slice_ref([i])
666                        }
667                        DataType::Int32 => {
668                            let i = i32::from_usize(len).expect("run overflow");
669                            Buffer::from_slice_ref([i])
670                        }
671                        DataType::Int64 => {
672                            let i = i64::from_usize(len).expect("run overflow");
673                            Buffer::from_slice_ref([i])
674                        }
675                        dt => unreachable!("Invalid run ends data type {dt}"),
676                    };
677
678                    let builder = ArrayData::builder(r.data_type().clone())
679                        .len(1)
680                        .buffers(vec![runs]);
681
682                    // SAFETY:
683                    // Valid by construction
684                    let runs = unsafe { builder.build_unchecked() };
685                    (
686                        vec![],
687                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
688                        false,
689                    )
690                }
691                d => unreachable!("{d}"),
692            },
693        };
694
695        let mut builder = ArrayDataBuilder::new(data_type.clone())
696            .len(len)
697            .buffers(buffers)
698            .child_data(child_data);
699
700        if has_nulls {
701            builder = builder.nulls(Some(NullBuffer::new_null(len)))
702        }
703
704        // SAFETY:
705        // Data valid by construction
706        unsafe { builder.build_unchecked() }
707    }
708
709    /// Returns a new empty [ArrayData] valid for `data_type`.
710    pub fn new_empty(data_type: &DataType) -> Self {
711        Self::new_null(data_type, 0)
712    }
713
714    /// Verifies that the buffers meet the minimum alignment requirements for the data type
715    ///
716    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
717    ///
718    /// This can be useful for when interacting with data sent over IPC or FFI, that may
719    /// not meet the minimum alignment requirements
720    ///
721    /// This also aligns buffers of children data
722    pub fn align_buffers(&mut self) {
723        let layout = layout(&self.data_type);
724        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
725            if let BufferSpec::FixedWidth { alignment, .. } = spec {
726                if buffer.as_ptr().align_offset(*alignment) != 0 {
727                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
728                }
729            }
730        }
731        // align children data recursively
732        for data in self.child_data.iter_mut() {
733            data.align_buffers()
734        }
735    }
736
737    /// "cheap" validation of an `ArrayData`. Ensures buffers are
738    /// sufficiently sized to store `len` + `offset` total elements of
739    /// `data_type` and performs other inexpensive consistency checks.
740    ///
741    /// This check is "cheap" in the sense that it does not validate the
742    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
743    /// are within the bounds of the values buffer).
744    ///
745    /// See [ArrayData::validate_data] to validate fully the offset content
746    /// and the validity of utf8 data
747    pub fn validate(&self) -> Result<(), ArrowError> {
748        // Need at least this mich space in each buffer
749        let len_plus_offset = self.len + self.offset;
750
751        // Check that the data layout conforms to the spec
752        let layout = layout(&self.data_type);
753
754        if !layout.can_contain_null_mask && self.nulls.is_some() {
755            return Err(ArrowError::InvalidArgumentError(format!(
756                "Arrays of type {:?} cannot contain a null bitmask",
757                self.data_type,
758            )));
759        }
760
761        // Check data buffers length for view types and other types
762        if self.buffers.len() < layout.buffers.len()
763            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
764        {
765            return Err(ArrowError::InvalidArgumentError(format!(
766                "Expected {} buffers in array of type {:?}, got {}",
767                layout.buffers.len(),
768                self.data_type,
769                self.buffers.len(),
770            )));
771        }
772
773        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
774            match spec {
775                BufferSpec::FixedWidth {
776                    byte_width,
777                    alignment,
778                } => {
779                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
780
781                    if buffer.len() < min_buffer_size {
782                        return Err(ArrowError::InvalidArgumentError(format!(
783                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
784                            min_buffer_size,
785                            i,
786                            self.data_type,
787                            buffer.len()
788                        )));
789                    }
790
791                    let align_offset = buffer.as_ptr().align_offset(*alignment);
792                    if align_offset != 0 {
793                        return Err(ArrowError::InvalidArgumentError(format!(
794                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
795                            self.data_type,
796                            align_offset.min(alignment - align_offset)
797                        )));
798                    }
799                }
800                BufferSpec::VariableWidth => {
801                    // not cheap to validate (need to look at the
802                    // data). Partially checked in validate_offsets
803                    // called below. Can check with `validate_full`
804                }
805                BufferSpec::BitMap => {
806                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
807                    if buffer.len() < min_buffer_size {
808                        return Err(ArrowError::InvalidArgumentError(format!(
809                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
810                            min_buffer_size,
811                            i,
812                            self.data_type,
813                            buffer.len()
814                        )));
815                    }
816                }
817                BufferSpec::AlwaysNull => {
818                    // Nothing to validate
819                }
820            }
821        }
822
823        // check null bit buffer size
824        if let Some(nulls) = self.nulls() {
825            if nulls.null_count() > self.len {
826                return Err(ArrowError::InvalidArgumentError(format!(
827                    "null_count {} for an array exceeds length of {} elements",
828                    nulls.null_count(),
829                    self.len
830                )));
831            }
832
833            let actual_len = nulls.validity().len();
834            let needed_len = bit_util::ceil(len_plus_offset, 8);
835            if actual_len < needed_len {
836                return Err(ArrowError::InvalidArgumentError(format!(
837                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
838                )));
839            }
840
841            if nulls.len() != self.len {
842                return Err(ArrowError::InvalidArgumentError(format!(
843                    "null buffer incorrect size. got {} expected {}",
844                    nulls.len(),
845                    self.len
846                )));
847            }
848        }
849
850        self.validate_child_data()?;
851
852        // Additional Type specific checks
853        match &self.data_type {
854            DataType::Utf8 | DataType::Binary => {
855                self.validate_offsets::<i32>(self.buffers[1].len())?;
856            }
857            DataType::LargeUtf8 | DataType::LargeBinary => {
858                self.validate_offsets::<i64>(self.buffers[1].len())?;
859            }
860            DataType::Dictionary(key_type, _value_type) => {
861                // At the moment, constructing a DictionaryArray will also check this
862                if !DataType::is_dictionary_key_type(key_type) {
863                    return Err(ArrowError::InvalidArgumentError(format!(
864                        "Dictionary key type must be integer, but was {key_type}"
865                    )));
866                }
867            }
868            DataType::RunEndEncoded(run_ends_type, _) => {
869                if run_ends_type.is_nullable() {
870                    return Err(ArrowError::InvalidArgumentError(
871                        "The nullable should be set to false for the field defining run_ends array.".to_string()
872                    ));
873                }
874                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
875                    return Err(ArrowError::InvalidArgumentError(format!(
876                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
877                        run_ends_type.data_type()
878                    )));
879                }
880            }
881            _ => {}
882        };
883
884        Ok(())
885    }
886
887    /// Returns a reference to the data in `buffer` as a typed slice
888    /// (typically `&[i32]` or `&[i64]`) after validating. The
889    /// returned slice is guaranteed to have at least `self.len + 1`
890    /// entries.
891    ///
892    /// For an empty array, the `buffer` can also be empty.
893    fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
894        // An empty list-like array can have 0 offsets
895        if self.len == 0 && self.buffers[0].is_empty() {
896            return Ok(&[]);
897        }
898
899        self.typed_buffer(0, self.len + 1)
900    }
901
902    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
903    fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
904        &self,
905        idx: usize,
906        len: usize,
907    ) -> Result<&[T], ArrowError> {
908        let buffer = &self.buffers[idx];
909
910        let required_len = (len + self.offset) * mem::size_of::<T>();
911
912        if buffer.len() < required_len {
913            return Err(ArrowError::InvalidArgumentError(format!(
914                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
915                idx,
916                self.data_type,
917                required_len,
918                buffer.len()
919            )));
920        }
921
922        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
923    }
924
925    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
926    /// offsets (of type T) into some other buffer of `values_length` bytes long
927    fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
928        &self,
929        values_length: usize,
930    ) -> Result<(), ArrowError> {
931        // Justification: buffer size was validated above
932        let offsets = self.typed_offsets::<T>()?;
933        if offsets.is_empty() {
934            return Ok(());
935        }
936
937        let first_offset = offsets[0].to_usize().ok_or_else(|| {
938            ArrowError::InvalidArgumentError(format!(
939                "Error converting offset[0] ({}) to usize for {}",
940                offsets[0], self.data_type
941            ))
942        })?;
943
944        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
945            ArrowError::InvalidArgumentError(format!(
946                "Error converting offset[{}] ({}) to usize for {}",
947                self.len, offsets[self.len], self.data_type
948            ))
949        })?;
950
951        if first_offset > values_length {
952            return Err(ArrowError::InvalidArgumentError(format!(
953                "First offset {} of {} is larger than values length {}",
954                first_offset, self.data_type, values_length,
955            )));
956        }
957
958        if last_offset > values_length {
959            return Err(ArrowError::InvalidArgumentError(format!(
960                "Last offset {} of {} is larger than values length {}",
961                last_offset, self.data_type, values_length,
962            )));
963        }
964
965        if first_offset > last_offset {
966            return Err(ArrowError::InvalidArgumentError(format!(
967                "First offset {} in {} is smaller than last offset {}",
968                first_offset, self.data_type, last_offset,
969            )));
970        }
971
972        Ok(())
973    }
974
975    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
976    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
977    fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
978        &self,
979        values_length: usize,
980    ) -> Result<(), ArrowError> {
981        let offsets: &[T] = self.typed_buffer(0, self.len)?;
982        let sizes: &[T] = self.typed_buffer(1, self.len)?;
983        for i in 0..values_length {
984            let size = sizes[i].to_usize().ok_or_else(|| {
985                ArrowError::InvalidArgumentError(format!(
986                    "Error converting size[{}] ({}) to usize for {}",
987                    i, sizes[i], self.data_type
988                ))
989            })?;
990            let offset = offsets[i].to_usize().ok_or_else(|| {
991                ArrowError::InvalidArgumentError(format!(
992                    "Error converting offset[{}] ({}) to usize for {}",
993                    i, offsets[i], self.data_type
994                ))
995            })?;
996            if size
997                .checked_add(offset)
998                .expect("Offset and size have exceeded the usize boundary")
999                > values_length
1000            {
1001                return Err(ArrowError::InvalidArgumentError(format!(
1002                    "Size {} at index {} is larger than the remaining values for {}",
1003                    size, i, self.data_type
1004                )));
1005            }
1006        }
1007        Ok(())
1008    }
1009
1010    /// Validates the layout of `child_data` ArrayData structures
1011    fn validate_child_data(&self) -> Result<(), ArrowError> {
1012        match &self.data_type {
1013            DataType::List(field) | DataType::Map(field, _) => {
1014                let values_data = self.get_single_valid_child_data(field.data_type())?;
1015                self.validate_offsets::<i32>(values_data.len)?;
1016                Ok(())
1017            }
1018            DataType::LargeList(field) => {
1019                let values_data = self.get_single_valid_child_data(field.data_type())?;
1020                self.validate_offsets::<i64>(values_data.len)?;
1021                Ok(())
1022            }
1023            DataType::ListView(field) => {
1024                let values_data = self.get_single_valid_child_data(field.data_type())?;
1025                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1026                Ok(())
1027            }
1028            DataType::LargeListView(field) => {
1029                let values_data = self.get_single_valid_child_data(field.data_type())?;
1030                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1031                Ok(())
1032            }
1033            DataType::FixedSizeList(field, list_size) => {
1034                let values_data = self.get_single_valid_child_data(field.data_type())?;
1035
1036                let list_size: usize = (*list_size).try_into().map_err(|_| {
1037                    ArrowError::InvalidArgumentError(format!(
1038                        "{} has a negative list_size {}",
1039                        self.data_type, list_size
1040                    ))
1041                })?;
1042
1043                let expected_values_len = self.len
1044                    .checked_mul(list_size)
1045                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1046
1047                if values_data.len < expected_values_len {
1048                    return Err(ArrowError::InvalidArgumentError(format!(
1049                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1050                        values_data.len, self.len, list_size, self.data_type
1051                    )));
1052                }
1053
1054                Ok(())
1055            }
1056            DataType::Struct(fields) => {
1057                self.validate_num_child_data(fields.len())?;
1058                for (i, field) in fields.iter().enumerate() {
1059                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1060
1061                    // Ensure child field has sufficient size
1062                    if field_data.len < self.len {
1063                        return Err(ArrowError::InvalidArgumentError(format!(
1064                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1065                            self.data_type,
1066                            i,
1067                            field.name(),
1068                            field_data.len,
1069                            self.len
1070                        )));
1071                    }
1072                }
1073                Ok(())
1074            }
1075            DataType::RunEndEncoded(run_ends_field, values_field) => {
1076                self.validate_num_child_data(2)?;
1077                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1078                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1079                if run_ends_data.len != values_data.len {
1080                    return Err(ArrowError::InvalidArgumentError(format!(
1081                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1082                        run_ends_data.len, values_data.len
1083                    )));
1084                }
1085                if run_ends_data.nulls.is_some() {
1086                    return Err(ArrowError::InvalidArgumentError(
1087                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1088                    ));
1089                }
1090                Ok(())
1091            }
1092            DataType::Union(fields, mode) => {
1093                self.validate_num_child_data(fields.len())?;
1094
1095                for (i, (_, field)) in fields.iter().enumerate() {
1096                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1097
1098                    if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1099                        return Err(ArrowError::InvalidArgumentError(format!(
1100                            "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1101                            i,
1102                            field_data.len,
1103                            self.len + self.offset
1104                        )));
1105                    }
1106                }
1107                Ok(())
1108            }
1109            DataType::Dictionary(_key_type, value_type) => {
1110                self.get_single_valid_child_data(value_type)?;
1111                Ok(())
1112            }
1113            _ => {
1114                // other types do not have child data
1115                if !self.child_data.is_empty() {
1116                    return Err(ArrowError::InvalidArgumentError(format!(
1117                        "Expected no child arrays for type {} but got {}",
1118                        self.data_type,
1119                        self.child_data.len()
1120                    )));
1121                }
1122                Ok(())
1123            }
1124        }
1125    }
1126
1127    /// Ensures that this array data has a single child_data with the
1128    /// expected type, and calls `validate()` on it. Returns a
1129    /// reference to that child_data
1130    fn get_single_valid_child_data(
1131        &self,
1132        expected_type: &DataType,
1133    ) -> Result<&ArrayData, ArrowError> {
1134        self.validate_num_child_data(1)?;
1135        self.get_valid_child_data(0, expected_type)
1136    }
1137
1138    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1139    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1140        if self.child_data.len() != expected_len {
1141            Err(ArrowError::InvalidArgumentError(format!(
1142                "Value data for {} should contain {} child data array(s), had {}",
1143                self.data_type,
1144                expected_len,
1145                self.child_data.len()
1146            )))
1147        } else {
1148            Ok(())
1149        }
1150    }
1151
1152    /// Ensures that `child_data[i]` has the expected type, calls
1153    /// `validate()` on it, and returns a reference to that child_data
1154    fn get_valid_child_data(
1155        &self,
1156        i: usize,
1157        expected_type: &DataType,
1158    ) -> Result<&ArrayData, ArrowError> {
1159        let values_data = self.child_data.get(i).ok_or_else(|| {
1160            ArrowError::InvalidArgumentError(format!(
1161                "{} did not have enough child arrays. Expected at least {} but had only {}",
1162                self.data_type,
1163                i + 1,
1164                self.child_data.len()
1165            ))
1166        })?;
1167
1168        if expected_type != &values_data.data_type {
1169            return Err(ArrowError::InvalidArgumentError(format!(
1170                "Child type mismatch for {}. Expected {} but child data had {}",
1171                self.data_type, expected_type, values_data.data_type
1172            )));
1173        }
1174
1175        values_data.validate()?;
1176        Ok(values_data)
1177    }
1178
1179    /// Validate that the data contained within this [`ArrayData`] is valid
1180    ///
1181    /// 1. Null count is correct
1182    /// 2. All offsets are valid
1183    /// 3. All String data is valid UTF-8
1184    /// 4. All dictionary offsets are valid
1185    ///
1186    /// Internally this calls:
1187    ///
1188    /// * [`Self::validate`]
1189    /// * [`Self::validate_nulls`]
1190    /// * [`Self::validate_values`]
1191    ///
1192    /// Note: this does not recurse into children, for a recursive variant
1193    /// see [`Self::validate_full`]
1194    pub fn validate_data(&self) -> Result<(), ArrowError> {
1195        self.validate()?;
1196
1197        self.validate_nulls()?;
1198        self.validate_values()?;
1199        Ok(())
1200    }
1201
1202    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1203    ///
1204    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1205    /// and all its children recursively
1206    pub fn validate_full(&self) -> Result<(), ArrowError> {
1207        self.validate_data()?;
1208        // validate all children recursively
1209        self.child_data
1210            .iter()
1211            .enumerate()
1212            .try_for_each(|(i, child_data)| {
1213                child_data.validate_full().map_err(|e| {
1214                    ArrowError::InvalidArgumentError(format!(
1215                        "{} child #{} invalid: {}",
1216                        self.data_type, i, e
1217                    ))
1218                })
1219            })?;
1220        Ok(())
1221    }
1222
1223    /// Validates the values stored within this [`ArrayData`] are valid
1224    /// without recursing into child [`ArrayData`]
1225    ///
1226    /// Does not (yet) check
1227    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1228    /// 2. the the null count is correct and that any
1229    /// 3. nullability requirements of its children are correct
1230    ///
1231    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1232    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1233        if let Some(nulls) = &self.nulls {
1234            let actual = nulls.len() - nulls.inner().count_set_bits();
1235            if actual != nulls.null_count() {
1236                return Err(ArrowError::InvalidArgumentError(format!(
1237                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1238                    nulls.null_count(),
1239                    actual
1240                )));
1241            }
1242        }
1243
1244        // In general non-nullable children should not contain nulls, however, for certain
1245        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1246        // space in the child. As such we permit nulls in the children in the corresponding
1247        // positions for such types
1248        match &self.data_type {
1249            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1250                if !f.is_nullable() {
1251                    self.validate_non_nullable(None, &self.child_data[0])?
1252                }
1253            }
1254            DataType::FixedSizeList(field, len) => {
1255                let child = &self.child_data[0];
1256                if !field.is_nullable() {
1257                    match &self.nulls {
1258                        Some(nulls) => {
1259                            let element_len = *len as usize;
1260                            let expanded = nulls.expand(element_len);
1261                            self.validate_non_nullable(Some(&expanded), child)?;
1262                        }
1263                        None => self.validate_non_nullable(None, child)?,
1264                    }
1265                }
1266            }
1267            DataType::Struct(fields) => {
1268                for (field, child) in fields.iter().zip(&self.child_data) {
1269                    if !field.is_nullable() {
1270                        self.validate_non_nullable(self.nulls(), child)?
1271                    }
1272                }
1273            }
1274            _ => {}
1275        }
1276
1277        Ok(())
1278    }
1279
1280    /// Verifies that `child` contains no nulls not present in `mask`
1281    fn validate_non_nullable(
1282        &self,
1283        mask: Option<&NullBuffer>,
1284        child: &ArrayData,
1285    ) -> Result<(), ArrowError> {
1286        let mask = match mask {
1287            Some(mask) => mask,
1288            None => {
1289                return match child.null_count() {
1290                    0 => Ok(()),
1291                    _ => Err(ArrowError::InvalidArgumentError(format!(
1292                        "non-nullable child of type {} contains nulls not present in parent {}",
1293                        child.data_type, self.data_type
1294                    ))),
1295                };
1296            }
1297        };
1298
1299        match child.nulls() {
1300            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1301                "non-nullable child of type {} contains nulls not present in parent",
1302                child.data_type
1303            ))),
1304            _ => Ok(()),
1305        }
1306    }
1307
1308    /// Validates the values stored within this [`ArrayData`] are valid
1309    /// without recursing into child [`ArrayData`]
1310    ///
1311    /// Does not (yet) check
1312    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1313    pub fn validate_values(&self) -> Result<(), ArrowError> {
1314        match &self.data_type {
1315            DataType::Utf8 => self.validate_utf8::<i32>(),
1316            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1317            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1318            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1319            DataType::BinaryView => {
1320                let views = self.typed_buffer::<u128>(0, self.len)?;
1321                validate_binary_view(views, &self.buffers[1..])
1322            }
1323            DataType::Utf8View => {
1324                let views = self.typed_buffer::<u128>(0, self.len)?;
1325                validate_string_view(views, &self.buffers[1..])
1326            }
1327            DataType::List(_) | DataType::Map(_, _) => {
1328                let child = &self.child_data[0];
1329                self.validate_offsets_full::<i32>(child.len)
1330            }
1331            DataType::LargeList(_) => {
1332                let child = &self.child_data[0];
1333                self.validate_offsets_full::<i64>(child.len)
1334            }
1335            DataType::Union(_, _) => {
1336                // Validate Union Array as part of implementing new Union semantics
1337                // See comments in `ArrayData::validate()`
1338                // https://github.com/apache/arrow-rs/issues/85
1339                //
1340                // TODO file follow on ticket for full union validation
1341                Ok(())
1342            }
1343            DataType::Dictionary(key_type, _value_type) => {
1344                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1345                let max_value = dictionary_length - 1;
1346                match key_type.as_ref() {
1347                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1348                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1349                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1350                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1351                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1352                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1353                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1354                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1355                    _ => unreachable!(),
1356                }
1357            }
1358            DataType::RunEndEncoded(run_ends, _values) => {
1359                let run_ends_data = self.child_data()[0].clone();
1360                match run_ends.data_type() {
1361                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1362                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1363                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1364                    _ => unreachable!(),
1365                }
1366            }
1367            _ => {
1368                // No extra validation check required for other types
1369                Ok(())
1370            }
1371        }
1372    }
1373
1374    /// Calls the `validate(item_index, range)` function for each of
1375    /// the ranges specified in the arrow offsets buffer of type
1376    /// `T`. Also validates that each offset is smaller than
1377    /// `offset_limit`
1378    ///
1379    /// For an empty array, the offsets buffer can either be empty
1380    /// or contain a single `0`.
1381    ///
1382    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1383    /// function would call `validate([1,2])`, and `validate([2,4])`
1384    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1385    where
1386        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1387        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1388    {
1389        self.typed_offsets::<T>()?
1390            .iter()
1391            .enumerate()
1392            .map(|(i, x)| {
1393                // check if the offset can be converted to usize
1394                let r = x.to_usize().ok_or_else(|| {
1395                    ArrowError::InvalidArgumentError(format!(
1396                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1397                    );
1398                // check if the offset exceeds the limit
1399                match r {
1400                    Ok(n) if n <= offset_limit => Ok((i, n)),
1401                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1402                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1403                    ),
1404                    Err(e) => Err(e),
1405                }
1406            })
1407            .scan(0_usize, |start, end| {
1408                // check offsets are monotonically increasing
1409                match end {
1410                    Ok((i, end)) if *start <= end => {
1411                        let range = Some(Ok((i, *start..end)));
1412                        *start = end;
1413                        range
1414                    }
1415                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1416                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1417                        i - 1, start, end))
1418                    )),
1419                    Err(err) => Some(Err(err)),
1420                }
1421            })
1422            .skip(1) // the first element is meaningless
1423            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1424                let (item_index, range) = res?;
1425                validate(item_index-1, range)
1426            })
1427    }
1428
1429    /// Ensures that all strings formed by the offsets in `buffers[0]`
1430    /// into `buffers[1]` are valid utf8 sequences
1431    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1432    where
1433        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1434    {
1435        let values_buffer = &self.buffers[1].as_slice();
1436        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1437            // Validate Offsets are correct
1438            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1439                if !values_str.is_char_boundary(range.start)
1440                    || !values_str.is_char_boundary(range.end)
1441                {
1442                    return Err(ArrowError::InvalidArgumentError(format!(
1443                        "incomplete utf-8 byte sequence from index {string_index}"
1444                    )));
1445                }
1446                Ok(())
1447            })
1448        } else {
1449            // find specific offset that failed utf8 validation
1450            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1451                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1452                    ArrowError::InvalidArgumentError(format!(
1453                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1454                    ))
1455                })?;
1456                Ok(())
1457            })
1458        }
1459    }
1460
1461    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1462    /// between `0` and `offset_limit`
1463    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1464    where
1465        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1466    {
1467        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1468            // No validation applied to each value, but the iteration
1469            // itself applies bounds checking to each range
1470            Ok(())
1471        })
1472    }
1473
1474    /// Validates that each value in self.buffers (typed as T)
1475    /// is within the range [0, max_value], inclusive
1476    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1477    where
1478        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1479    {
1480        let required_len = self.len + self.offset;
1481        let buffer = &self.buffers[0];
1482
1483        // This should have been checked as part of `validate()` prior
1484        // to calling `validate_full()` but double check to be sure
1485        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1486
1487        // Justification: buffer size was validated above
1488        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1489
1490        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1491            // Do not check the value is null (value can be arbitrary)
1492            if self.is_null(i) {
1493                return Ok(());
1494            }
1495            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1496                ArrowError::InvalidArgumentError(format!(
1497                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1498                ))
1499            })?;
1500
1501            if dict_index < 0 || dict_index > max_value {
1502                return Err(ArrowError::InvalidArgumentError(format!(
1503                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1504                )));
1505            }
1506            Ok(())
1507        })
1508    }
1509
1510    /// Validates that each value in run_ends array is positive and strictly increasing.
1511    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1512    where
1513        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1514    {
1515        let values = self.typed_buffer::<T>(0, self.len)?;
1516        let mut prev_value: i64 = 0_i64;
1517        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1518            let value: i64 = inp_value.try_into().map_err(|_| {
1519                ArrowError::InvalidArgumentError(format!(
1520                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1521                ))
1522            })?;
1523            if value <= 0_i64 {
1524                return Err(ArrowError::InvalidArgumentError(format!(
1525                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1526                )));
1527            }
1528            if ix > 0 && value <= prev_value {
1529                return Err(ArrowError::InvalidArgumentError(format!(
1530                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1531                )));
1532            }
1533
1534            prev_value = value;
1535            Ok(())
1536        })?;
1537
1538        if prev_value.as_usize() < (self.offset + self.len) {
1539            return Err(ArrowError::InvalidArgumentError(format!(
1540                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1541                self.offset + self.len
1542            )));
1543        }
1544        Ok(())
1545    }
1546
1547    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1548    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1549    /// return false when the arrays are logically equal
1550    pub fn ptr_eq(&self, other: &Self) -> bool {
1551        if self.offset != other.offset
1552            || self.len != other.len
1553            || self.data_type != other.data_type
1554            || self.buffers.len() != other.buffers.len()
1555            || self.child_data.len() != other.child_data.len()
1556        {
1557            return false;
1558        }
1559
1560        match (&self.nulls, &other.nulls) {
1561            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1562            (Some(_), None) | (None, Some(_)) => return false,
1563            _ => {}
1564        };
1565
1566        if !self
1567            .buffers
1568            .iter()
1569            .zip(other.buffers.iter())
1570            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1571        {
1572            return false;
1573        }
1574
1575        self.child_data
1576            .iter()
1577            .zip(other.child_data.iter())
1578            .all(|(a, b)| a.ptr_eq(b))
1579    }
1580
1581    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1582    pub fn into_builder(self) -> ArrayDataBuilder {
1583        self.into()
1584    }
1585}
1586
1587/// Return the expected [`DataTypeLayout`] Arrays of this data
1588/// type are expected to have
1589pub fn layout(data_type: &DataType) -> DataTypeLayout {
1590    // based on C/C++ implementation in
1591    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1592    use arrow_schema::IntervalUnit::*;
1593
1594    match data_type {
1595        DataType::Null => DataTypeLayout {
1596            buffers: vec![],
1597            can_contain_null_mask: false,
1598            variadic: false,
1599        },
1600        DataType::Boolean => DataTypeLayout {
1601            buffers: vec![BufferSpec::BitMap],
1602            can_contain_null_mask: true,
1603            variadic: false,
1604        },
1605        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1606        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1607        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1608        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1609        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1610        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1611        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1612        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1613        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1614        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1615        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1616        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1617        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1618        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1619        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1620        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1621        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1622        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1623        DataType::Interval(MonthDayNano) => {
1624            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1625        }
1626        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1627        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1628        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1629        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1630        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1631        DataType::FixedSizeBinary(size) => {
1632            let spec = BufferSpec::FixedWidth {
1633                byte_width: (*size).try_into().unwrap(),
1634                alignment: mem::align_of::<u8>(),
1635            };
1636            DataTypeLayout {
1637                buffers: vec![spec],
1638                can_contain_null_mask: true,
1639                variadic: false,
1640            }
1641        }
1642        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1643        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1644        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1645        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1646        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1647        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1648        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1649        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1650        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1651        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1652        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1653        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1654        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1655        DataType::Union(_, mode) => {
1656            let type_ids = BufferSpec::FixedWidth {
1657                byte_width: mem::size_of::<i8>(),
1658                alignment: mem::align_of::<i8>(),
1659            };
1660
1661            DataTypeLayout {
1662                buffers: match mode {
1663                    UnionMode::Sparse => {
1664                        vec![type_ids]
1665                    }
1666                    UnionMode::Dense => {
1667                        vec![
1668                            type_ids,
1669                            BufferSpec::FixedWidth {
1670                                byte_width: mem::size_of::<i32>(),
1671                                alignment: mem::align_of::<i32>(),
1672                            },
1673                        ]
1674                    }
1675                },
1676                can_contain_null_mask: false,
1677                variadic: false,
1678            }
1679        }
1680        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1681    }
1682}
1683
1684/// Layout specification for a data type
1685#[derive(Debug, PartialEq, Eq)]
1686// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
1687pub struct DataTypeLayout {
1688    /// A vector of buffer layout specifications, one for each expected buffer
1689    pub buffers: Vec<BufferSpec>,
1690
1691    /// Can contain a null bitmask
1692    pub can_contain_null_mask: bool,
1693
1694    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
1695    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
1696    /// buffers.len(). Buffers that exceed the lower bound are legal.
1697    pub variadic: bool,
1698}
1699
1700impl DataTypeLayout {
1701    /// Describes a basic numeric array where each element has type `T`
1702    pub fn new_fixed_width<T>() -> Self {
1703        Self {
1704            buffers: vec![BufferSpec::FixedWidth {
1705                byte_width: mem::size_of::<T>(),
1706                alignment: mem::align_of::<T>(),
1707            }],
1708            can_contain_null_mask: true,
1709            variadic: false,
1710        }
1711    }
1712
1713    /// Describes arrays which have no data of their own
1714    /// but may still have a Null Bitmap (e.g. FixedSizeList)
1715    pub fn new_nullable_empty() -> Self {
1716        Self {
1717            buffers: vec![],
1718            can_contain_null_mask: true,
1719            variadic: false,
1720        }
1721    }
1722
1723    /// Describes arrays which have no data of their own
1724    /// (e.g. RunEndEncoded).
1725    pub fn new_empty() -> Self {
1726        Self {
1727            buffers: vec![],
1728            can_contain_null_mask: false,
1729            variadic: false,
1730        }
1731    }
1732
1733    /// Describes a basic numeric array where each element has a fixed
1734    /// with offset buffer of type `T`, followed by a
1735    /// variable width data buffer
1736    pub fn new_binary<T>() -> Self {
1737        Self {
1738            buffers: vec![
1739                // offsets
1740                BufferSpec::FixedWidth {
1741                    byte_width: mem::size_of::<T>(),
1742                    alignment: mem::align_of::<T>(),
1743                },
1744                // values
1745                BufferSpec::VariableWidth,
1746            ],
1747            can_contain_null_mask: true,
1748            variadic: false,
1749        }
1750    }
1751
1752    /// Describes a view type
1753    pub fn new_view() -> Self {
1754        Self {
1755            buffers: vec![BufferSpec::FixedWidth {
1756                byte_width: mem::size_of::<u128>(),
1757                alignment: mem::align_of::<u128>(),
1758            }],
1759            can_contain_null_mask: true,
1760            variadic: true,
1761        }
1762    }
1763
1764    /// Describes a list view type
1765    pub fn new_list_view<T>() -> Self {
1766        Self {
1767            buffers: vec![
1768                BufferSpec::FixedWidth {
1769                    byte_width: mem::size_of::<T>(),
1770                    alignment: mem::align_of::<T>(),
1771                },
1772                BufferSpec::FixedWidth {
1773                    byte_width: mem::size_of::<T>(),
1774                    alignment: mem::align_of::<T>(),
1775                },
1776            ],
1777            can_contain_null_mask: true,
1778            variadic: true,
1779        }
1780    }
1781}
1782
1783/// Layout specification for a single data type buffer
1784#[derive(Debug, PartialEq, Eq)]
1785pub enum BufferSpec {
1786    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
1787    ///
1788    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
1789    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
1790    ///
1791    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
1792    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
1793    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
1794    ///
1795    /// Note that these alignment requirements will vary between architectures
1796    FixedWidth {
1797        /// The width of each element in bytes
1798        byte_width: usize,
1799        /// The alignment required by Rust for an array of the corresponding primitive
1800        alignment: usize,
1801    },
1802    /// Variable width, such as string data for utf8 data
1803    VariableWidth,
1804    /// Buffer holds a bitmap.
1805    ///
1806    /// Note: Unlike the C++ implementation, the null/validity buffer
1807    /// is handled specially rather than as another of the buffers in
1808    /// the spec, so this variant is only used for the Boolean type.
1809    BitMap,
1810    /// Buffer is always null. Unused currently in Rust implementation,
1811    /// (used in C++ for Union type)
1812    #[allow(dead_code)]
1813    AlwaysNull,
1814}
1815
1816impl PartialEq for ArrayData {
1817    fn eq(&self, other: &Self) -> bool {
1818        equal::equal(self, other)
1819    }
1820}
1821
1822/// A boolean flag that cannot be mutated outside of unsafe code.
1823///
1824/// Defaults to a value of false.
1825///
1826/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
1827///
1828/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
1829///
1830/// # Example
1831/// ```rust
1832/// use arrow_data::UnsafeFlag;
1833/// assert!(!UnsafeFlag::default().get()); // default is false
1834/// let mut flag = UnsafeFlag::new();
1835/// assert!(!flag.get()); // defaults to false
1836/// // can only set it to true in unsafe code
1837/// unsafe { flag.set(true) };
1838/// assert!(flag.get()); // now true
1839/// ```
1840#[derive(Debug, Clone)]
1841#[doc(hidden)]
1842pub struct UnsafeFlag(bool);
1843
1844impl UnsafeFlag {
1845    /// Creates a new `UnsafeFlag` with the value set to `false`.
1846    ///
1847    /// See examples on [`Self::new`]
1848    #[inline]
1849    pub const fn new() -> Self {
1850        Self(false)
1851    }
1852
1853    /// Sets the value of the flag to the given value
1854    ///
1855    /// Note this can purposely only be done in `unsafe` code
1856    ///
1857    /// # Safety
1858    ///
1859    /// If set, the flag will be set to the given value. There is nothing
1860    /// immediately unsafe about doing so, however, the flag can be used to
1861    /// subsequently bypass safety checks in the [`ArrayDataBuilder`].
1862    #[inline]
1863    pub unsafe fn set(&mut self, val: bool) {
1864        self.0 = val;
1865    }
1866
1867    /// Returns the value of the flag
1868    #[inline]
1869    pub fn get(&self) -> bool {
1870        self.0
1871    }
1872}
1873
1874// Manual impl to make it clear you can not construct unsafe with true
1875impl Default for UnsafeFlag {
1876    fn default() -> Self {
1877        Self::new()
1878    }
1879}
1880
1881/// Builder for [`ArrayData`] type
1882#[derive(Debug)]
1883pub struct ArrayDataBuilder {
1884    data_type: DataType,
1885    len: usize,
1886    null_count: Option<usize>,
1887    null_bit_buffer: Option<Buffer>,
1888    nulls: Option<NullBuffer>,
1889    offset: usize,
1890    buffers: Vec<Buffer>,
1891    child_data: Vec<ArrayData>,
1892    /// Should buffers be realigned (copying if necessary)?
1893    ///
1894    /// Defaults to false.
1895    align_buffers: bool,
1896    /// Should data validation be skipped for this [`ArrayData`]?
1897    ///
1898    /// Defaults to false.
1899    ///
1900    /// # Safety
1901    ///
1902    /// This flag can only be set to true using `unsafe` APIs. However, once true
1903    /// subsequent calls to `build()` may result in undefined behavior if the data
1904    /// is not valid.
1905    skip_validation: UnsafeFlag,
1906}
1907
1908impl ArrayDataBuilder {
1909    #[inline]
1910    /// Creates a new array data builder
1911    pub const fn new(data_type: DataType) -> Self {
1912        Self {
1913            data_type,
1914            len: 0,
1915            null_count: None,
1916            null_bit_buffer: None,
1917            nulls: None,
1918            offset: 0,
1919            buffers: vec![],
1920            child_data: vec![],
1921            align_buffers: false,
1922            skip_validation: UnsafeFlag::new(),
1923        }
1924    }
1925
1926    /// Creates a new array data builder from an existing one, changing the data type
1927    pub fn data_type(self, data_type: DataType) -> Self {
1928        Self { data_type, ..self }
1929    }
1930
1931    #[inline]
1932    #[allow(clippy::len_without_is_empty)]
1933    /// Sets the length of the [ArrayData]
1934    pub const fn len(mut self, n: usize) -> Self {
1935        self.len = n;
1936        self
1937    }
1938
1939    /// Sets the null buffer of the [ArrayData]
1940    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1941        self.nulls = nulls;
1942        self.null_count = None;
1943        self.null_bit_buffer = None;
1944        self
1945    }
1946
1947    /// Sets the null count of the [ArrayData]
1948    pub fn null_count(mut self, null_count: usize) -> Self {
1949        self.null_count = Some(null_count);
1950        self
1951    }
1952
1953    /// Sets the `null_bit_buffer` of the [ArrayData]
1954    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1955        self.nulls = None;
1956        self.null_bit_buffer = buf;
1957        self
1958    }
1959
1960    /// Sets the offset of the [ArrayData]
1961    #[inline]
1962    pub const fn offset(mut self, n: usize) -> Self {
1963        self.offset = n;
1964        self
1965    }
1966
1967    /// Sets the buffers of the [ArrayData]
1968    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1969        self.buffers = v;
1970        self
1971    }
1972
1973    /// Adds a single buffer to the [ArrayData]'s buffers
1974    pub fn add_buffer(mut self, b: Buffer) -> Self {
1975        self.buffers.push(b);
1976        self
1977    }
1978
1979    /// Adds multiple buffers to the [ArrayData]'s buffers
1980    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1981        self.buffers.extend(bs);
1982        self
1983    }
1984
1985    /// Sets the child data of the [ArrayData]
1986    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1987        self.child_data = v;
1988        self
1989    }
1990
1991    /// Adds a single child data to the [ArrayData]'s child data
1992    pub fn add_child_data(mut self, r: ArrayData) -> Self {
1993        self.child_data.push(r);
1994        self
1995    }
1996
1997    /// Creates an array data, without any validation
1998    ///
1999    /// Note: This is shorthand for
2000    /// ```rust
2001    /// # #[expect(unsafe_op_in_unsafe_fn)]
2002    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
2003    /// # let _ = unsafe {
2004    /// builder.skip_validation(true).build().unwrap()
2005    /// # };
2006    /// ```
2007    ///
2008    /// # Safety
2009    ///
2010    /// The same caveats as [`ArrayData::new_unchecked`]
2011    /// apply.
2012    pub unsafe fn build_unchecked(self) -> ArrayData {
2013        unsafe { self.skip_validation(true) }.build().unwrap()
2014    }
2015
2016    /// Creates an `ArrayData`, consuming `self`
2017    ///
2018    /// # Safety
2019    ///
2020    /// By default the underlying buffers are checked to ensure they are valid
2021    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
2022    /// to true (by the `unsafe` API) this validation is skipped. If the data is
2023    /// not valid, undefined behavior will result.
2024    pub fn build(self) -> Result<ArrayData, ArrowError> {
2025        let Self {
2026            data_type,
2027            len,
2028            null_count,
2029            null_bit_buffer,
2030            nulls,
2031            offset,
2032            buffers,
2033            child_data,
2034            align_buffers,
2035            skip_validation,
2036        } = self;
2037
2038        let nulls = nulls
2039            .or_else(|| {
2040                let buffer = null_bit_buffer?;
2041                let buffer = BooleanBuffer::new(buffer, offset, len);
2042                Some(match null_count {
2043                    Some(n) => {
2044                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2045                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2046                    }
2047                    None => NullBuffer::new(buffer),
2048                })
2049            })
2050            .filter(|b| b.null_count() != 0);
2051
2052        let mut data = ArrayData {
2053            data_type,
2054            len,
2055            offset,
2056            buffers,
2057            child_data,
2058            nulls,
2059        };
2060
2061        if align_buffers {
2062            data.align_buffers();
2063        }
2064
2065        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2066        if !skip_validation.get() || cfg!(feature = "force_validate") {
2067            data.validate_data()?;
2068        }
2069        Ok(data)
2070    }
2071
2072    /// Creates an array data, validating all inputs, and aligning any buffers
2073    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2074    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2075        self.align_buffers(true).build()
2076    }
2077
2078    /// Ensure that all buffers are aligned, copying data if necessary
2079    ///
2080    /// Rust requires that arrays are aligned to their corresponding primitive,
2081    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2082    ///
2083    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2084    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2085    ///
2086    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2087    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2088    /// when necessary, making it useful when interacting with buffers produced by other systems,
2089    /// e.g. IPC or FFI.
2090    ///
2091    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2092    /// insufficiently aligned buffers.
2093    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2094        self.align_buffers = align_buffers;
2095        self
2096    }
2097
2098    /// Skips validation of the data.
2099    ///
2100    /// If this flag is enabled, `[Self::build`] will skip validation of the
2101    /// data
2102    ///
2103    /// If this flag is not enabled, `[Self::build`] will validate that all
2104    /// buffers are valid and will return an error if any data is invalid.
2105    /// Validation can be expensive.
2106    ///
2107    /// # Safety
2108    ///
2109    /// If validation is skipped, the buffers must form a valid Arrow array,
2110    /// otherwise undefined behavior will result
2111    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2112        unsafe {
2113            self.skip_validation.set(skip_validation);
2114        }
2115        self
2116    }
2117}
2118
2119impl From<ArrayData> for ArrayDataBuilder {
2120    fn from(d: ArrayData) -> Self {
2121        Self {
2122            data_type: d.data_type,
2123            len: d.len,
2124            offset: d.offset,
2125            buffers: d.buffers,
2126            child_data: d.child_data,
2127            nulls: d.nulls,
2128            null_bit_buffer: None,
2129            null_count: None,
2130            align_buffers: false,
2131            skip_validation: UnsafeFlag::new(),
2132        }
2133    }
2134}
2135
2136#[cfg(test)]
2137mod tests {
2138    use super::*;
2139    use arrow_schema::{Field, Fields};
2140
2141    // See arrow/tests/array_data_validation.rs for test of array validation
2142
2143    /// returns a buffer initialized with some constant value for tests
2144    fn make_i32_buffer(n: usize) -> Buffer {
2145        Buffer::from_slice_ref(vec![42i32; n])
2146    }
2147
2148    /// returns a buffer initialized with some constant value for tests
2149    fn make_f32_buffer(n: usize) -> Buffer {
2150        Buffer::from_slice_ref(vec![42f32; n])
2151    }
2152
2153    #[test]
2154    fn test_builder() {
2155        // Buffer needs to be at least 25 long
2156        let v = (0..25).collect::<Vec<i32>>();
2157        let b1 = Buffer::from_slice_ref(&v);
2158        let arr_data = ArrayData::builder(DataType::Int32)
2159            .len(20)
2160            .offset(5)
2161            .add_buffer(b1)
2162            .null_bit_buffer(Some(Buffer::from([
2163                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2164            ])))
2165            .build()
2166            .unwrap();
2167
2168        assert_eq!(20, arr_data.len());
2169        assert_eq!(10, arr_data.null_count());
2170        assert_eq!(5, arr_data.offset());
2171        assert_eq!(1, arr_data.buffers().len());
2172        assert_eq!(
2173            Buffer::from_slice_ref(&v).as_slice(),
2174            arr_data.buffers()[0].as_slice()
2175        );
2176    }
2177
2178    #[test]
2179    fn test_builder_with_child_data() {
2180        let child_arr_data = ArrayData::try_new(
2181            DataType::Int32,
2182            5,
2183            None,
2184            0,
2185            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2186            vec![],
2187        )
2188        .unwrap();
2189
2190        let field = Arc::new(Field::new("x", DataType::Int32, true));
2191        let data_type = DataType::Struct(vec![field].into());
2192
2193        let arr_data = ArrayData::builder(data_type)
2194            .len(5)
2195            .offset(0)
2196            .add_child_data(child_arr_data.clone())
2197            .build()
2198            .unwrap();
2199
2200        assert_eq!(5, arr_data.len());
2201        assert_eq!(1, arr_data.child_data().len());
2202        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2203    }
2204
2205    #[test]
2206    fn test_null_count() {
2207        let mut bit_v: [u8; 2] = [0; 2];
2208        bit_util::set_bit(&mut bit_v, 0);
2209        bit_util::set_bit(&mut bit_v, 3);
2210        bit_util::set_bit(&mut bit_v, 10);
2211        let arr_data = ArrayData::builder(DataType::Int32)
2212            .len(16)
2213            .add_buffer(make_i32_buffer(16))
2214            .null_bit_buffer(Some(Buffer::from(bit_v)))
2215            .build()
2216            .unwrap();
2217        assert_eq!(13, arr_data.null_count());
2218
2219        // Test with offset
2220        let mut bit_v: [u8; 2] = [0; 2];
2221        bit_util::set_bit(&mut bit_v, 0);
2222        bit_util::set_bit(&mut bit_v, 3);
2223        bit_util::set_bit(&mut bit_v, 10);
2224        let arr_data = ArrayData::builder(DataType::Int32)
2225            .len(12)
2226            .offset(2)
2227            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2228            .null_bit_buffer(Some(Buffer::from(bit_v)))
2229            .build()
2230            .unwrap();
2231        assert_eq!(10, arr_data.null_count());
2232    }
2233
2234    #[test]
2235    fn test_null_buffer_ref() {
2236        let mut bit_v: [u8; 2] = [0; 2];
2237        bit_util::set_bit(&mut bit_v, 0);
2238        bit_util::set_bit(&mut bit_v, 3);
2239        bit_util::set_bit(&mut bit_v, 10);
2240        let arr_data = ArrayData::builder(DataType::Int32)
2241            .len(16)
2242            .add_buffer(make_i32_buffer(16))
2243            .null_bit_buffer(Some(Buffer::from(bit_v)))
2244            .build()
2245            .unwrap();
2246        assert!(arr_data.nulls().is_some());
2247        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2248    }
2249
2250    #[test]
2251    fn test_slice() {
2252        let mut bit_v: [u8; 2] = [0; 2];
2253        bit_util::set_bit(&mut bit_v, 0);
2254        bit_util::set_bit(&mut bit_v, 3);
2255        bit_util::set_bit(&mut bit_v, 10);
2256        let data = ArrayData::builder(DataType::Int32)
2257            .len(16)
2258            .add_buffer(make_i32_buffer(16))
2259            .null_bit_buffer(Some(Buffer::from(bit_v)))
2260            .build()
2261            .unwrap();
2262        let new_data = data.slice(1, 15);
2263        assert_eq!(data.len() - 1, new_data.len());
2264        assert_eq!(1, new_data.offset());
2265        assert_eq!(data.null_count(), new_data.null_count());
2266
2267        // slice of a slice (removes one null)
2268        let new_data = new_data.slice(1, 14);
2269        assert_eq!(data.len() - 2, new_data.len());
2270        assert_eq!(2, new_data.offset());
2271        assert_eq!(data.null_count() - 1, new_data.null_count());
2272    }
2273
2274    #[test]
2275    fn test_equality() {
2276        let int_data = ArrayData::builder(DataType::Int32)
2277            .len(1)
2278            .add_buffer(make_i32_buffer(1))
2279            .build()
2280            .unwrap();
2281
2282        let float_data = ArrayData::builder(DataType::Float32)
2283            .len(1)
2284            .add_buffer(make_f32_buffer(1))
2285            .build()
2286            .unwrap();
2287        assert_ne!(int_data, float_data);
2288        assert!(!int_data.ptr_eq(&float_data));
2289        assert!(int_data.ptr_eq(&int_data));
2290
2291        #[allow(clippy::redundant_clone)]
2292        let int_data_clone = int_data.clone();
2293        assert_eq!(int_data, int_data_clone);
2294        assert!(int_data.ptr_eq(&int_data_clone));
2295        assert!(int_data_clone.ptr_eq(&int_data));
2296
2297        let int_data_slice = int_data_clone.slice(1, 0);
2298        assert!(int_data_slice.ptr_eq(&int_data_slice));
2299        assert!(!int_data.ptr_eq(&int_data_slice));
2300        assert!(!int_data_slice.ptr_eq(&int_data));
2301
2302        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2303        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2304        let string_data = ArrayData::try_new(
2305            DataType::Utf8,
2306            3,
2307            Some(Buffer::from_iter(vec![true, false, true])),
2308            0,
2309            vec![offsets_buffer, data_buffer],
2310            vec![],
2311        )
2312        .unwrap();
2313
2314        assert_ne!(float_data, string_data);
2315        assert!(!float_data.ptr_eq(&string_data));
2316
2317        assert!(string_data.ptr_eq(&string_data));
2318
2319        #[allow(clippy::redundant_clone)]
2320        let string_data_cloned = string_data.clone();
2321        assert!(string_data_cloned.ptr_eq(&string_data));
2322        assert!(string_data.ptr_eq(&string_data_cloned));
2323
2324        let string_data_slice = string_data.slice(1, 2);
2325        assert!(string_data_slice.ptr_eq(&string_data_slice));
2326        assert!(!string_data_slice.ptr_eq(&string_data))
2327    }
2328
2329    #[test]
2330    fn test_slice_memory_size() {
2331        let mut bit_v: [u8; 2] = [0; 2];
2332        bit_util::set_bit(&mut bit_v, 0);
2333        bit_util::set_bit(&mut bit_v, 3);
2334        bit_util::set_bit(&mut bit_v, 10);
2335        let data = ArrayData::builder(DataType::Int32)
2336            .len(16)
2337            .add_buffer(make_i32_buffer(16))
2338            .null_bit_buffer(Some(Buffer::from(bit_v)))
2339            .build()
2340            .unwrap();
2341        let new_data = data.slice(1, 14);
2342        assert_eq!(
2343            data.get_slice_memory_size().unwrap() - 8,
2344            new_data.get_slice_memory_size().unwrap()
2345        );
2346        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2347        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2348        let string_data = ArrayData::try_new(
2349            DataType::Utf8,
2350            3,
2351            Some(Buffer::from_iter(vec![true, false, true])),
2352            0,
2353            vec![offsets_buffer, data_buffer],
2354            vec![],
2355        )
2356        .unwrap();
2357        let string_data_slice = string_data.slice(1, 2);
2358        //4 bytes of offset and 2 bytes of data reduced by slicing.
2359        assert_eq!(
2360            string_data.get_slice_memory_size().unwrap() - 6,
2361            string_data_slice.get_slice_memory_size().unwrap()
2362        );
2363    }
2364
2365    #[test]
2366    fn test_count_nulls() {
2367        let buffer = Buffer::from([0b00010110, 0b10011111]);
2368        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2369        let count = count_nulls(Some(&buffer), 0, 16);
2370        assert_eq!(count, 7);
2371
2372        let count = count_nulls(Some(&buffer), 4, 8);
2373        assert_eq!(count, 3);
2374    }
2375
2376    #[test]
2377    fn test_contains_nulls() {
2378        let buffer: Buffer =
2379            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2380        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2381        assert!(contains_nulls(Some(&buffer), 0, 6));
2382        assert!(contains_nulls(Some(&buffer), 0, 3));
2383        assert!(!contains_nulls(Some(&buffer), 3, 2));
2384        assert!(!contains_nulls(Some(&buffer), 0, 0));
2385    }
2386
2387    #[test]
2388    fn test_alignment() {
2389        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2390        let sliced = buffer.slice(1);
2391
2392        let mut data = ArrayData {
2393            data_type: DataType::Int32,
2394            len: 0,
2395            offset: 0,
2396            buffers: vec![buffer],
2397            child_data: vec![],
2398            nulls: None,
2399        };
2400        data.validate_full().unwrap();
2401
2402        // break alignment in data
2403        data.buffers[0] = sliced;
2404        let err = data.validate().unwrap_err();
2405
2406        assert_eq!(
2407            err.to_string(),
2408            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2409        );
2410
2411        data.align_buffers();
2412        data.validate_full().unwrap();
2413    }
2414
2415    #[test]
2416    fn test_alignment_struct() {
2417        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2418        let sliced = buffer.slice(1);
2419
2420        let child_data = ArrayData {
2421            data_type: DataType::Int32,
2422            len: 0,
2423            offset: 0,
2424            buffers: vec![buffer],
2425            child_data: vec![],
2426            nulls: None,
2427        };
2428
2429        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2430        let mut data = ArrayData {
2431            data_type: schema,
2432            len: 0,
2433            offset: 0,
2434            buffers: vec![],
2435            child_data: vec![child_data],
2436            nulls: None,
2437        };
2438        data.validate_full().unwrap();
2439
2440        // break alignment in child data
2441        data.child_data[0].buffers[0] = sliced;
2442        let err = data.validate().unwrap_err();
2443
2444        assert_eq!(
2445            err.to_string(),
2446            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2447        );
2448
2449        data.align_buffers();
2450        data.validate_full().unwrap();
2451    }
2452
2453    #[test]
2454    fn test_null_view_types() {
2455        let array_len = 32;
2456        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2457        assert_eq!(array.len(), array_len);
2458        for i in 0..array.len() {
2459            assert!(array.is_null(i));
2460        }
2461
2462        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2463        assert_eq!(array.len(), array_len);
2464        for i in 0..array.len() {
2465            assert!(array.is_null(i));
2466        }
2467    }
2468}