arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
19//! common attributes and operations for Arrow array.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35    null_bit_buffer: Option<&NullBuffer>,
36    offset: usize,
37    len: usize,
38) -> bool {
39    match null_bit_buffer {
40        Some(buffer) => {
41            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42                Some((start, end)) => start != 0 || end != len,
43                None => len != 0, // No non-null values
44            }
45        }
46        None => false, // No null buffer
47    }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52    null_bit_buffer: Option<&NullBuffer>,
53    offset: usize,
54    len: usize,
55) -> usize {
56    if let Some(buf) = null_bit_buffer {
57        let buffer = buf.buffer();
58        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59    } else {
60        0
61    }
62}
63
64/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67    let empty_buffer = MutableBuffer::new(0);
68    match data_type {
69        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70        DataType::Boolean => {
71            let bytes = bit_util::ceil(capacity, 8);
72            let buffer = MutableBuffer::new(bytes);
73            [buffer, empty_buffer]
74        }
75        DataType::UInt8
76        | DataType::UInt16
77        | DataType::UInt32
78        | DataType::UInt64
79        | DataType::Int8
80        | DataType::Int16
81        | DataType::Int32
82        | DataType::Int64
83        | DataType::Float16
84        | DataType::Float32
85        | DataType::Float64
86        | DataType::Decimal32(_, _)
87        | DataType::Decimal64(_, _)
88        | DataType::Decimal128(_, _)
89        | DataType::Decimal256(_, _)
90        | DataType::Date32
91        | DataType::Time32(_)
92        | DataType::Date64
93        | DataType::Time64(_)
94        | DataType::Duration(_)
95        | DataType::Timestamp(_, _)
96        | DataType::Interval(_) => [
97            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98            empty_buffer,
99        ],
100        DataType::Utf8 | DataType::Binary => {
101            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102            // safety: `unsafe` code assumes that this buffer is initialized with one element
103            buffer.push(0i32);
104            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105        }
106        DataType::LargeUtf8 | DataType::LargeBinary => {
107            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108            // safety: `unsafe` code assumes that this buffer is initialized with one element
109            buffer.push(0i64);
110            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111        }
112        DataType::BinaryView | DataType::Utf8View => [
113            MutableBuffer::new(capacity * mem::size_of::<u128>()),
114            empty_buffer,
115        ],
116        DataType::List(_) | DataType::Map(_, _) => {
117            // offset buffer always starts with a zero
118            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119            buffer.push(0i32);
120            [buffer, empty_buffer]
121        }
122        DataType::ListView(_) => [
123            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124            MutableBuffer::new(capacity * mem::size_of::<i32>()),
125        ],
126        DataType::LargeList(_) => {
127            // offset buffer always starts with a zero
128            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129            buffer.push(0i64);
130            [buffer, empty_buffer]
131        }
132        DataType::LargeListView(_) => [
133            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134            MutableBuffer::new(capacity * mem::size_of::<i64>()),
135        ],
136        DataType::FixedSizeBinary(size) => {
137            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138        }
139        DataType::Dictionary(k, _) => [
140            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141            empty_buffer,
142        ],
143        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144            [empty_buffer, MutableBuffer::new(0)]
145        }
146        DataType::Union(_, mode) => {
147            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148            match mode {
149                UnionMode::Sparse => [type_ids, empty_buffer],
150                UnionMode::Dense => {
151                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152                    [type_ids, offsets]
153                }
154            }
155        }
156    }
157}
158
159/// A generic representation of Arrow array data which encapsulates common attributes
160/// and operations for Arrow array.
161///
162/// Specific operations for different arrays types (e.g., primitive, list, struct)
163/// are implemented in `Array`.
164///
165/// # Memory Layout
166///
167/// `ArrayData` has references to one or more underlying data buffers
168/// and optional child ArrayData, depending on type as illustrated
169/// below. Bitmaps are not shown for simplicity but they are stored
170/// similarly to the buffers.
171///
172/// ```text
173///                        offset
174///                       points to
175/// ┌───────────────────┐ start of  ┌───────┐       Different
176/// │                   │   data    │       │     ArrayData may
177/// │ArrayData {        │           │....   │     also refers to
178/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
179/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
180/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
181/// │  buffers: [       │           │5882   │◀─
182/// │    ...            │  │        │4323   │
183/// │  ]                │   ─ ─ ─ ─▶│4859   │
184/// │  child_data: [    │           │....   │
185/// │    ...            │           │       │
186/// │  ]                │           └───────┘
187/// │}                  │
188/// │                   │            Shared Buffer uses
189/// │               │   │            bytes::Bytes to hold
190/// └───────────────────┘            actual data values
191///           ┌ ─ ─ ┘
192///
193///           ▼
194/// ┌───────────────────┐
195/// │ArrayData {        │
196/// │  ...              │
197/// │}                  │
198/// │                   │
199/// └───────────────────┘
200///
201/// Child ArrayData may also have its own buffers and children
202/// ```
203
204#[derive(Debug, Clone)]
205pub struct ArrayData {
206    /// The data type
207    data_type: DataType,
208
209    /// The number of elements
210    len: usize,
211
212    /// The offset in number of items (not bytes).
213    ///
214    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
215    /// does NOT apply to [`Self::nulls`].
216    offset: usize,
217
218    /// The buffers that store the actual data for this array, as defined
219    /// in the [Arrow Spec].
220    ///
221    /// Depending on the array types, [`Self::buffers`] can hold different
222    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
223    /// positions.
224    ///
225    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
226    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
227    /// we need to satisfy `len`.
228    ///
229    /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout)
230    buffers: Vec<Buffer>,
231
232    /// The child(ren) of this array.
233    ///
234    /// Only non-empty for nested types, such as `ListArray` and
235    /// `StructArray`.
236    ///
237    /// The first logical element in each child element begins at `offset`.
238    ///
239    /// If the child element also has an offset then these offsets are
240    /// cumulative.
241    child_data: Vec<ArrayData>,
242
243    /// The null bitmap.
244    ///
245    /// `None` indicates all values are non-null in this array.
246    ///
247    /// [`Self::offset]` does not apply to the null bitmap. While the
248    /// BooleanBuffer may be sliced (have its own offset) internally, this
249    /// `NullBuffer` always represents exactly `len` elements.
250    nulls: Option<NullBuffer>,
251}
252
253/// A thread-safe, shared reference to the Arrow array data.
254pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257    /// Create a new ArrayData instance;
258    ///
259    /// If `null_count` is not specified, the number of nulls in
260    /// null_bit_buffer is calculated.
261    ///
262    /// If the number of nulls is 0 then the null_bit_buffer
263    /// is set to `None`.
264    ///
265    /// # Safety
266    ///
267    /// The input values *must* form a valid Arrow array for
268    /// `data_type`, or undefined behavior can result.
269    ///
270    /// Note: This is a low level API and most users of the arrow
271    /// crate should create arrays using the methods in the `array`
272    /// module.
273    pub unsafe fn new_unchecked(
274        data_type: DataType,
275        len: usize,
276        null_count: Option<usize>,
277        null_bit_buffer: Option<Buffer>,
278        offset: usize,
279        buffers: Vec<Buffer>,
280        child_data: Vec<ArrayData>,
281    ) -> Self {
282        let mut skip_validation = UnsafeFlag::new();
283        // SAFETY: caller responsible for ensuring data is valid
284        unsafe { skip_validation.set(true) };
285
286        ArrayDataBuilder {
287            data_type,
288            len,
289            null_count,
290            null_bit_buffer,
291            nulls: None,
292            offset,
293            buffers,
294            child_data,
295            align_buffers: false,
296            skip_validation,
297        }
298        .build()
299        .unwrap()
300    }
301
302    /// Create a new ArrayData, validating that the provided buffers form a valid
303    /// Arrow array of the specified data type.
304    ///
305    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
306    /// is set to `None`.
307    ///
308    /// Internally this calls through to [`Self::validate_data`]
309    ///
310    /// Note: This is a low level API and most users of the arrow crate should create
311    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
312    pub fn try_new(
313        data_type: DataType,
314        len: usize,
315        null_bit_buffer: Option<Buffer>,
316        offset: usize,
317        buffers: Vec<Buffer>,
318        child_data: Vec<ArrayData>,
319    ) -> Result<Self, ArrowError> {
320        // we must check the length of `null_bit_buffer` first
321        // because we use this buffer to calculate `null_count`
322        // in `Self::new_unchecked`.
323        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
324            let needed_len = bit_util::ceil(len + offset, 8);
325            if null_bit_buffer.len() < needed_len {
326                return Err(ArrowError::InvalidArgumentError(format!(
327                    "null_bit_buffer size too small. got {} needed {}",
328                    null_bit_buffer.len(),
329                    needed_len
330                )));
331            }
332        }
333        // Safety justification: `validate_full` is called below
334        let new_self = unsafe {
335            Self::new_unchecked(
336                data_type,
337                len,
338                None,
339                null_bit_buffer,
340                offset,
341                buffers,
342                child_data,
343            )
344        };
345
346        // As the data is not trusted, do a full validation of its contents
347        // We don't need to validate children as we can assume that the
348        // [`ArrayData`] in `child_data` have already been validated through
349        // a call to `ArrayData::try_new` or created using unsafe
350        new_self.validate_data()?;
351        Ok(new_self)
352    }
353
354    /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`]
355    #[inline]
356    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
357        ArrayDataBuilder::new(data_type)
358    }
359
360    /// Returns a reference to the [`DataType`] of this [`ArrayData`]
361    #[inline]
362    pub const fn data_type(&self) -> &DataType {
363        &self.data_type
364    }
365
366    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
367    pub fn buffers(&self) -> &[Buffer] {
368        &self.buffers
369    }
370
371    /// Returns a slice of children [`ArrayData`]. This will be non
372    /// empty for type such as lists and structs.
373    pub fn child_data(&self) -> &[ArrayData] {
374        &self.child_data[..]
375    }
376
377    /// Returns whether the element at index `i` is null
378    #[inline]
379    pub fn is_null(&self, i: usize) -> bool {
380        match &self.nulls {
381            Some(v) => v.is_null(i),
382            None => false,
383        }
384    }
385
386    /// Returns a reference to the null buffer of this [`ArrayData`] if any
387    ///
388    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
389    #[inline]
390    pub fn nulls(&self) -> Option<&NullBuffer> {
391        self.nulls.as_ref()
392    }
393
394    /// Returns whether the element at index `i` is not null
395    #[inline]
396    pub fn is_valid(&self, i: usize) -> bool {
397        !self.is_null(i)
398    }
399
400    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
401    #[inline]
402    pub const fn len(&self) -> usize {
403        self.len
404    }
405
406    /// Returns whether this [`ArrayData`] is empty
407    #[inline]
408    pub const fn is_empty(&self) -> bool {
409        self.len == 0
410    }
411
412    /// Returns the offset of this [`ArrayData`]
413    #[inline]
414    pub const fn offset(&self) -> usize {
415        self.offset
416    }
417
418    /// Returns the total number of nulls in this array
419    #[inline]
420    pub fn null_count(&self) -> usize {
421        self.nulls
422            .as_ref()
423            .map(|x| x.null_count())
424            .unwrap_or_default()
425    }
426
427    /// Returns the total number of bytes of memory occupied by the
428    /// buffers owned by this [`ArrayData`] and all of its
429    /// children. (See also diagram on [`ArrayData`]).
430    ///
431    /// Note that this [`ArrayData`] may only refer to a subset of the
432    /// data in the underlying [`Buffer`]s (due to `offset` and
433    /// `length`), but the size returned includes the entire size of
434    /// the buffers.
435    ///
436    /// If multiple [`ArrayData`]s refer to the same underlying
437    /// [`Buffer`]s they will both report the same size.
438    pub fn get_buffer_memory_size(&self) -> usize {
439        let mut size = 0;
440        for buffer in &self.buffers {
441            size += buffer.capacity();
442        }
443        if let Some(bitmap) = &self.nulls {
444            size += bitmap.buffer().capacity()
445        }
446        for child in &self.child_data {
447            size += child.get_buffer_memory_size();
448        }
449        size
450    }
451
452    /// Returns the total number of the bytes of memory occupied by
453    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
454    ///
455    /// This is approximately the number of bytes if a new
456    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
457    /// exactly the data needed.
458    ///
459    /// For example, a [`DataType::Int64`] with `100` elements,
460    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
461    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
462    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
463    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
464    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
465        let mut result: usize = 0;
466        let layout = layout(&self.data_type);
467
468        for spec in layout.buffers.iter() {
469            match spec {
470                BufferSpec::FixedWidth { byte_width, .. } => {
471                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
472                        ArrowError::ComputeError(
473                            "Integer overflow computing buffer size".to_string(),
474                        )
475                    })?;
476                    result += buffer_size;
477                }
478                BufferSpec::VariableWidth => {
479                    let buffer_len = match self.data_type {
480                        DataType::Utf8 | DataType::Binary => {
481                            let offsets = self.typed_offsets::<i32>()?;
482                            (offsets[self.len] - offsets[0]) as usize
483                        }
484                        DataType::LargeUtf8 | DataType::LargeBinary => {
485                            let offsets = self.typed_offsets::<i64>()?;
486                            (offsets[self.len] - offsets[0]) as usize
487                        }
488                        _ => {
489                            return Err(ArrowError::NotYetImplemented(format!(
490                                "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
491                                self.data_type
492                            )));
493                        }
494                    };
495                    result += buffer_len;
496                }
497                BufferSpec::BitMap => {
498                    let buffer_size = bit_util::ceil(self.len, 8);
499                    result += buffer_size;
500                }
501                BufferSpec::AlwaysNull => {
502                    // Nothing to do
503                }
504            }
505        }
506
507        if self.nulls().is_some() {
508            result += bit_util::ceil(self.len, 8);
509        }
510
511        for child in &self.child_data {
512            result += child.get_slice_memory_size()?;
513        }
514        Ok(result)
515    }
516
517    /// Returns the total number of bytes of memory occupied
518    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
519    /// children. (See also diagram on [`ArrayData`]).
520    ///
521    /// Equivalent to:
522    ///  `size_of_val(self)` +
523    ///  [`Self::get_buffer_memory_size`] +
524    ///  `size_of_val(child)` for all children
525    pub fn get_array_memory_size(&self) -> usize {
526        let mut size = mem::size_of_val(self);
527
528        // Calculate rest of the fields top down which contain actual data
529        for buffer in &self.buffers {
530            size += mem::size_of::<Buffer>();
531            size += buffer.capacity();
532        }
533        if let Some(nulls) = &self.nulls {
534            size += nulls.buffer().capacity();
535        }
536        for child in &self.child_data {
537            size += child.get_array_memory_size();
538        }
539
540        size
541    }
542
543    /// Creates a zero-copy slice of itself. This creates a new
544    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
545    /// different offset and len
546    ///
547    /// # Panics
548    ///
549    /// Panics if `offset + length > self.len()`.
550    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
551        assert!((offset + length) <= self.len());
552
553        if let DataType::Struct(_) = self.data_type() {
554            // Slice into children
555            let new_offset = self.offset + offset;
556            ArrayData {
557                data_type: self.data_type().clone(),
558                len: length,
559                offset: new_offset,
560                buffers: self.buffers.clone(),
561                // Slice child data, to propagate offsets down to them
562                child_data: self
563                    .child_data()
564                    .iter()
565                    .map(|data| data.slice(offset, length))
566                    .collect(),
567                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
568            }
569        } else {
570            let mut new_data = self.clone();
571
572            new_data.len = length;
573            new_data.offset = offset + self.offset;
574            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
575
576            new_data
577        }
578    }
579
580    /// Returns the `buffer` as a slice of type `T` starting at self.offset
581    ///
582    /// # Panics
583    /// This function panics if:
584    /// * the buffer is not byte-aligned with type T, or
585    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
586    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
587        &self.buffers()[buffer].typed_data()[self.offset..]
588    }
589
590    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
591    pub fn new_null(data_type: &DataType, len: usize) -> Self {
592        let bit_len = bit_util::ceil(len, 8);
593        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
594
595        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
596            Some(width) => (vec![zeroed(width * len)], vec![], true),
597            None => match data_type {
598                DataType::Null => (vec![], vec![], false),
599                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
600                DataType::Binary | DataType::Utf8 => {
601                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
602                }
603                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
604                DataType::LargeBinary | DataType::LargeUtf8 => {
605                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
606                }
607                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
608                DataType::List(f) | DataType::Map(f, _) => (
609                    vec![zeroed((len + 1) * 4)],
610                    vec![ArrayData::new_empty(f.data_type())],
611                    true,
612                ),
613                DataType::LargeList(f) => (
614                    vec![zeroed((len + 1) * 8)],
615                    vec![ArrayData::new_empty(f.data_type())],
616                    true,
617                ),
618                DataType::FixedSizeList(f, list_len) => (
619                    vec![],
620                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
621                    true,
622                ),
623                DataType::Struct(fields) => (
624                    vec![],
625                    fields
626                        .iter()
627                        .map(|f| Self::new_null(f.data_type(), len))
628                        .collect(),
629                    true,
630                ),
631                DataType::Dictionary(k, v) => (
632                    vec![zeroed(k.primitive_width().unwrap() * len)],
633                    vec![ArrayData::new_empty(v.as_ref())],
634                    true,
635                ),
636                DataType::Union(f, mode) => {
637                    let (id, _) = f.iter().next().unwrap();
638                    let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
639                    let buffers = match mode {
640                        UnionMode::Sparse => vec![ids],
641                        UnionMode::Dense => {
642                            let end_offset = i32::from_usize(len).unwrap();
643                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
644                        }
645                    };
646
647                    let children = f
648                        .iter()
649                        .enumerate()
650                        .map(|(idx, (_, f))| {
651                            if idx == 0 || *mode == UnionMode::Sparse {
652                                Self::new_null(f.data_type(), len)
653                            } else {
654                                Self::new_empty(f.data_type())
655                            }
656                        })
657                        .collect();
658
659                    (buffers, children, false)
660                }
661                DataType::RunEndEncoded(r, v) => {
662                    let runs = match r.data_type() {
663                        DataType::Int16 => {
664                            let i = i16::from_usize(len).expect("run overflow");
665                            Buffer::from_slice_ref([i])
666                        }
667                        DataType::Int32 => {
668                            let i = i32::from_usize(len).expect("run overflow");
669                            Buffer::from_slice_ref([i])
670                        }
671                        DataType::Int64 => {
672                            let i = i64::from_usize(len).expect("run overflow");
673                            Buffer::from_slice_ref([i])
674                        }
675                        dt => unreachable!("Invalid run ends data type {dt}"),
676                    };
677
678                    let builder = ArrayData::builder(r.data_type().clone())
679                        .len(1)
680                        .buffers(vec![runs]);
681
682                    // SAFETY:
683                    // Valid by construction
684                    let runs = unsafe { builder.build_unchecked() };
685                    (
686                        vec![],
687                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
688                        false,
689                    )
690                }
691                d => unreachable!("{d}"),
692            },
693        };
694
695        let mut builder = ArrayDataBuilder::new(data_type.clone())
696            .len(len)
697            .buffers(buffers)
698            .child_data(child_data);
699
700        if has_nulls {
701            builder = builder.nulls(Some(NullBuffer::new_null(len)))
702        }
703
704        // SAFETY:
705        // Data valid by construction
706        unsafe { builder.build_unchecked() }
707    }
708
709    /// Returns a new empty [ArrayData] valid for `data_type`.
710    pub fn new_empty(data_type: &DataType) -> Self {
711        Self::new_null(data_type, 0)
712    }
713
714    /// Verifies that the buffers meet the minimum alignment requirements for the data type
715    ///
716    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
717    ///
718    /// This can be useful for when interacting with data sent over IPC or FFI, that may
719    /// not meet the minimum alignment requirements
720    ///
721    /// This also aligns buffers of children data
722    pub fn align_buffers(&mut self) {
723        let layout = layout(&self.data_type);
724        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
725            if let BufferSpec::FixedWidth { alignment, .. } = spec {
726                if buffer.as_ptr().align_offset(*alignment) != 0 {
727                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
728                }
729            }
730        }
731        // align children data recursively
732        for data in self.child_data.iter_mut() {
733            data.align_buffers()
734        }
735    }
736
737    /// "cheap" validation of an `ArrayData`. Ensures buffers are
738    /// sufficiently sized to store `len` + `offset` total elements of
739    /// `data_type` and performs other inexpensive consistency checks.
740    ///
741    /// This check is "cheap" in the sense that it does not validate the
742    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
743    /// are within the bounds of the values buffer).
744    ///
745    /// See [ArrayData::validate_data] to validate fully the offset content
746    /// and the validity of utf8 data
747    pub fn validate(&self) -> Result<(), ArrowError> {
748        // Need at least this mich space in each buffer
749        let len_plus_offset = self.len + self.offset;
750
751        // Check that the data layout conforms to the spec
752        let layout = layout(&self.data_type);
753
754        if !layout.can_contain_null_mask && self.nulls.is_some() {
755            return Err(ArrowError::InvalidArgumentError(format!(
756                "Arrays of type {:?} cannot contain a null bitmask",
757                self.data_type,
758            )));
759        }
760
761        // Check data buffers length for view types and other types
762        if self.buffers.len() < layout.buffers.len()
763            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
764        {
765            return Err(ArrowError::InvalidArgumentError(format!(
766                "Expected {} buffers in array of type {:?}, got {}",
767                layout.buffers.len(),
768                self.data_type,
769                self.buffers.len(),
770            )));
771        }
772
773        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
774            match spec {
775                BufferSpec::FixedWidth {
776                    byte_width,
777                    alignment,
778                } => {
779                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
780
781                    if buffer.len() < min_buffer_size {
782                        return Err(ArrowError::InvalidArgumentError(format!(
783                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
784                            min_buffer_size,
785                            i,
786                            self.data_type,
787                            buffer.len()
788                        )));
789                    }
790
791                    let align_offset = buffer.as_ptr().align_offset(*alignment);
792                    if align_offset != 0 {
793                        return Err(ArrowError::InvalidArgumentError(format!(
794                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
795                            self.data_type,
796                            align_offset.min(alignment - align_offset)
797                        )));
798                    }
799                }
800                BufferSpec::VariableWidth => {
801                    // not cheap to validate (need to look at the
802                    // data). Partially checked in validate_offsets
803                    // called below. Can check with `validate_full`
804                }
805                BufferSpec::BitMap => {
806                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
807                    if buffer.len() < min_buffer_size {
808                        return Err(ArrowError::InvalidArgumentError(format!(
809                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
810                            min_buffer_size,
811                            i,
812                            self.data_type,
813                            buffer.len()
814                        )));
815                    }
816                }
817                BufferSpec::AlwaysNull => {
818                    // Nothing to validate
819                }
820            }
821        }
822
823        // check null bit buffer size
824        if let Some(nulls) = self.nulls() {
825            if nulls.null_count() > self.len {
826                return Err(ArrowError::InvalidArgumentError(format!(
827                    "null_count {} for an array exceeds length of {} elements",
828                    nulls.null_count(),
829                    self.len
830                )));
831            }
832
833            let actual_len = nulls.validity().len();
834            let needed_len = bit_util::ceil(len_plus_offset, 8);
835            if actual_len < needed_len {
836                return Err(ArrowError::InvalidArgumentError(format!(
837                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
838                )));
839            }
840
841            if nulls.len() != self.len {
842                return Err(ArrowError::InvalidArgumentError(format!(
843                    "null buffer incorrect size. got {} expected {}",
844                    nulls.len(),
845                    self.len
846                )));
847            }
848        }
849
850        self.validate_child_data()?;
851
852        // Additional Type specific checks
853        match &self.data_type {
854            DataType::Utf8 | DataType::Binary => {
855                self.validate_offsets::<i32>(self.buffers[1].len())?;
856            }
857            DataType::LargeUtf8 | DataType::LargeBinary => {
858                self.validate_offsets::<i64>(self.buffers[1].len())?;
859            }
860            DataType::Dictionary(key_type, _value_type) => {
861                // At the moment, constructing a DictionaryArray will also check this
862                if !DataType::is_dictionary_key_type(key_type) {
863                    return Err(ArrowError::InvalidArgumentError(format!(
864                        "Dictionary key type must be integer, but was {key_type}"
865                    )));
866                }
867            }
868            DataType::RunEndEncoded(run_ends_type, _) => {
869                if run_ends_type.is_nullable() {
870                    return Err(ArrowError::InvalidArgumentError(
871                        "The nullable should be set to false for the field defining run_ends array.".to_string()
872                    ));
873                }
874                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
875                    return Err(ArrowError::InvalidArgumentError(format!(
876                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
877                        run_ends_type.data_type()
878                    )));
879                }
880            }
881            _ => {}
882        };
883
884        Ok(())
885    }
886
887    /// Returns a reference to the data in `buffer` as a typed slice
888    /// (typically `&[i32]` or `&[i64]`) after validating. The
889    /// returned slice is guaranteed to have at least `self.len + 1`
890    /// entries.
891    ///
892    /// For an empty array, the `buffer` can also be empty.
893    fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
894        // An empty list-like array can have 0 offsets
895        if self.len == 0 && self.buffers[0].is_empty() {
896            return Ok(&[]);
897        }
898
899        self.typed_buffer(0, self.len + 1)
900    }
901
902    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
903    fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
904        &self,
905        idx: usize,
906        len: usize,
907    ) -> Result<&[T], ArrowError> {
908        let buffer = &self.buffers[idx];
909
910        let required_len = (len + self.offset) * mem::size_of::<T>();
911
912        if buffer.len() < required_len {
913            return Err(ArrowError::InvalidArgumentError(format!(
914                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
915                idx,
916                self.data_type,
917                required_len,
918                buffer.len()
919            )));
920        }
921
922        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
923    }
924
925    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
926    /// offsets (of type T) into some other buffer of `values_length` bytes long
927    fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
928        &self,
929        values_length: usize,
930    ) -> Result<(), ArrowError> {
931        // Justification: buffer size was validated above
932        let offsets = self.typed_offsets::<T>()?;
933        if offsets.is_empty() {
934            return Ok(());
935        }
936
937        let first_offset = offsets[0].to_usize().ok_or_else(|| {
938            ArrowError::InvalidArgumentError(format!(
939                "Error converting offset[0] ({}) to usize for {}",
940                offsets[0], self.data_type
941            ))
942        })?;
943
944        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
945            ArrowError::InvalidArgumentError(format!(
946                "Error converting offset[{}] ({}) to usize for {}",
947                self.len, offsets[self.len], self.data_type
948            ))
949        })?;
950
951        if first_offset > values_length {
952            return Err(ArrowError::InvalidArgumentError(format!(
953                "First offset {} of {} is larger than values length {}",
954                first_offset, self.data_type, values_length,
955            )));
956        }
957
958        if last_offset > values_length {
959            return Err(ArrowError::InvalidArgumentError(format!(
960                "Last offset {} of {} is larger than values length {}",
961                last_offset, self.data_type, values_length,
962            )));
963        }
964
965        if first_offset > last_offset {
966            return Err(ArrowError::InvalidArgumentError(format!(
967                "First offset {} in {} is smaller than last offset {}",
968                first_offset, self.data_type, last_offset,
969            )));
970        }
971
972        Ok(())
973    }
974
975    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
976    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
977    fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
978        &self,
979        values_length: usize,
980    ) -> Result<(), ArrowError> {
981        let offsets: &[T] = self.typed_buffer(0, self.len)?;
982        let sizes: &[T] = self.typed_buffer(1, self.len)?;
983        if offsets.len() != sizes.len() {
984            return Err(ArrowError::ComputeError(format!(
985                "ListView offsets len {} does not match sizes len {}",
986                offsets.len(),
987                sizes.len()
988            )));
989        }
990
991        for i in 0..sizes.len() {
992            let size = sizes[i].to_usize().ok_or_else(|| {
993                ArrowError::InvalidArgumentError(format!(
994                    "Error converting size[{}] ({}) to usize for {}",
995                    i, sizes[i], self.data_type
996                ))
997            })?;
998            let offset = offsets[i].to_usize().ok_or_else(|| {
999                ArrowError::InvalidArgumentError(format!(
1000                    "Error converting offset[{}] ({}) to usize for {}",
1001                    i, offsets[i], self.data_type
1002                ))
1003            })?;
1004            if size
1005                .checked_add(offset)
1006                .expect("Offset and size have exceeded the usize boundary")
1007                > values_length
1008            {
1009                return Err(ArrowError::InvalidArgumentError(format!(
1010                    "Size {} at index {} is larger than the remaining values for {}",
1011                    size, i, self.data_type
1012                )));
1013            }
1014        }
1015        Ok(())
1016    }
1017
1018    /// Validates the layout of `child_data` ArrayData structures
1019    fn validate_child_data(&self) -> Result<(), ArrowError> {
1020        match &self.data_type {
1021            DataType::List(field) | DataType::Map(field, _) => {
1022                let values_data = self.get_single_valid_child_data(field.data_type())?;
1023                self.validate_offsets::<i32>(values_data.len)?;
1024                Ok(())
1025            }
1026            DataType::LargeList(field) => {
1027                let values_data = self.get_single_valid_child_data(field.data_type())?;
1028                self.validate_offsets::<i64>(values_data.len)?;
1029                Ok(())
1030            }
1031            DataType::ListView(field) => {
1032                let values_data = self.get_single_valid_child_data(field.data_type())?;
1033                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1034                Ok(())
1035            }
1036            DataType::LargeListView(field) => {
1037                let values_data = self.get_single_valid_child_data(field.data_type())?;
1038                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1039                Ok(())
1040            }
1041            DataType::FixedSizeList(field, list_size) => {
1042                let values_data = self.get_single_valid_child_data(field.data_type())?;
1043
1044                let list_size: usize = (*list_size).try_into().map_err(|_| {
1045                    ArrowError::InvalidArgumentError(format!(
1046                        "{} has a negative list_size {}",
1047                        self.data_type, list_size
1048                    ))
1049                })?;
1050
1051                let expected_values_len = self.len
1052                    .checked_mul(list_size)
1053                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1054
1055                if values_data.len < expected_values_len {
1056                    return Err(ArrowError::InvalidArgumentError(format!(
1057                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1058                        values_data.len, self.len, list_size, self.data_type
1059                    )));
1060                }
1061
1062                Ok(())
1063            }
1064            DataType::Struct(fields) => {
1065                self.validate_num_child_data(fields.len())?;
1066                for (i, field) in fields.iter().enumerate() {
1067                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1068
1069                    // Ensure child field has sufficient size
1070                    if field_data.len < self.len {
1071                        return Err(ArrowError::InvalidArgumentError(format!(
1072                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1073                            self.data_type,
1074                            i,
1075                            field.name(),
1076                            field_data.len,
1077                            self.len
1078                        )));
1079                    }
1080                }
1081                Ok(())
1082            }
1083            DataType::RunEndEncoded(run_ends_field, values_field) => {
1084                self.validate_num_child_data(2)?;
1085                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1086                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1087                if run_ends_data.len != values_data.len {
1088                    return Err(ArrowError::InvalidArgumentError(format!(
1089                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1090                        run_ends_data.len, values_data.len
1091                    )));
1092                }
1093                if run_ends_data.nulls.is_some() {
1094                    return Err(ArrowError::InvalidArgumentError(
1095                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1096                    ));
1097                }
1098                Ok(())
1099            }
1100            DataType::Union(fields, mode) => {
1101                self.validate_num_child_data(fields.len())?;
1102
1103                for (i, (_, field)) in fields.iter().enumerate() {
1104                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1105
1106                    if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1107                        return Err(ArrowError::InvalidArgumentError(format!(
1108                            "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1109                            i,
1110                            field_data.len,
1111                            self.len + self.offset
1112                        )));
1113                    }
1114                }
1115                Ok(())
1116            }
1117            DataType::Dictionary(_key_type, value_type) => {
1118                self.get_single_valid_child_data(value_type)?;
1119                Ok(())
1120            }
1121            _ => {
1122                // other types do not have child data
1123                if !self.child_data.is_empty() {
1124                    return Err(ArrowError::InvalidArgumentError(format!(
1125                        "Expected no child arrays for type {} but got {}",
1126                        self.data_type,
1127                        self.child_data.len()
1128                    )));
1129                }
1130                Ok(())
1131            }
1132        }
1133    }
1134
1135    /// Ensures that this array data has a single child_data with the
1136    /// expected type, and calls `validate()` on it. Returns a
1137    /// reference to that child_data
1138    fn get_single_valid_child_data(
1139        &self,
1140        expected_type: &DataType,
1141    ) -> Result<&ArrayData, ArrowError> {
1142        self.validate_num_child_data(1)?;
1143        self.get_valid_child_data(0, expected_type)
1144    }
1145
1146    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1147    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1148        if self.child_data.len() != expected_len {
1149            Err(ArrowError::InvalidArgumentError(format!(
1150                "Value data for {} should contain {} child data array(s), had {}",
1151                self.data_type,
1152                expected_len,
1153                self.child_data.len()
1154            )))
1155        } else {
1156            Ok(())
1157        }
1158    }
1159
1160    /// Ensures that `child_data[i]` has the expected type, calls
1161    /// `validate()` on it, and returns a reference to that child_data
1162    fn get_valid_child_data(
1163        &self,
1164        i: usize,
1165        expected_type: &DataType,
1166    ) -> Result<&ArrayData, ArrowError> {
1167        let values_data = self.child_data.get(i).ok_or_else(|| {
1168            ArrowError::InvalidArgumentError(format!(
1169                "{} did not have enough child arrays. Expected at least {} but had only {}",
1170                self.data_type,
1171                i + 1,
1172                self.child_data.len()
1173            ))
1174        })?;
1175
1176        if expected_type != &values_data.data_type {
1177            return Err(ArrowError::InvalidArgumentError(format!(
1178                "Child type mismatch for {}. Expected {} but child data had {}",
1179                self.data_type, expected_type, values_data.data_type
1180            )));
1181        }
1182
1183        values_data.validate()?;
1184        Ok(values_data)
1185    }
1186
1187    /// Validate that the data contained within this [`ArrayData`] is valid
1188    ///
1189    /// 1. Null count is correct
1190    /// 2. All offsets are valid
1191    /// 3. All String data is valid UTF-8
1192    /// 4. All dictionary offsets are valid
1193    ///
1194    /// Internally this calls:
1195    ///
1196    /// * [`Self::validate`]
1197    /// * [`Self::validate_nulls`]
1198    /// * [`Self::validate_values`]
1199    ///
1200    /// Note: this does not recurse into children, for a recursive variant
1201    /// see [`Self::validate_full`]
1202    pub fn validate_data(&self) -> Result<(), ArrowError> {
1203        self.validate()?;
1204
1205        self.validate_nulls()?;
1206        self.validate_values()?;
1207        Ok(())
1208    }
1209
1210    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1211    ///
1212    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1213    /// and all its children recursively
1214    pub fn validate_full(&self) -> Result<(), ArrowError> {
1215        self.validate_data()?;
1216        // validate all children recursively
1217        self.child_data
1218            .iter()
1219            .enumerate()
1220            .try_for_each(|(i, child_data)| {
1221                child_data.validate_full().map_err(|e| {
1222                    ArrowError::InvalidArgumentError(format!(
1223                        "{} child #{} invalid: {}",
1224                        self.data_type, i, e
1225                    ))
1226                })
1227            })?;
1228        Ok(())
1229    }
1230
1231    /// Validates the values stored within this [`ArrayData`] are valid
1232    /// without recursing into child [`ArrayData`]
1233    ///
1234    /// Does not (yet) check
1235    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1236    /// 2. the the null count is correct and that any
1237    /// 3. nullability requirements of its children are correct
1238    ///
1239    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1240    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1241        if let Some(nulls) = &self.nulls {
1242            let actual = nulls.len() - nulls.inner().count_set_bits();
1243            if actual != nulls.null_count() {
1244                return Err(ArrowError::InvalidArgumentError(format!(
1245                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1246                    nulls.null_count(),
1247                    actual
1248                )));
1249            }
1250        }
1251
1252        // In general non-nullable children should not contain nulls, however, for certain
1253        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1254        // space in the child. As such we permit nulls in the children in the corresponding
1255        // positions for such types
1256        match &self.data_type {
1257            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1258                if !f.is_nullable() {
1259                    self.validate_non_nullable(None, &self.child_data[0])?
1260                }
1261            }
1262            DataType::FixedSizeList(field, len) => {
1263                let child = &self.child_data[0];
1264                if !field.is_nullable() {
1265                    match &self.nulls {
1266                        Some(nulls) => {
1267                            let element_len = *len as usize;
1268                            let expanded = nulls.expand(element_len);
1269                            self.validate_non_nullable(Some(&expanded), child)?;
1270                        }
1271                        None => self.validate_non_nullable(None, child)?,
1272                    }
1273                }
1274            }
1275            DataType::Struct(fields) => {
1276                for (field, child) in fields.iter().zip(&self.child_data) {
1277                    if !field.is_nullable() {
1278                        self.validate_non_nullable(self.nulls(), child)?
1279                    }
1280                }
1281            }
1282            _ => {}
1283        }
1284
1285        Ok(())
1286    }
1287
1288    /// Verifies that `child` contains no nulls not present in `mask`
1289    fn validate_non_nullable(
1290        &self,
1291        mask: Option<&NullBuffer>,
1292        child: &ArrayData,
1293    ) -> Result<(), ArrowError> {
1294        let mask = match mask {
1295            Some(mask) => mask,
1296            None => {
1297                return match child.null_count() {
1298                    0 => Ok(()),
1299                    _ => Err(ArrowError::InvalidArgumentError(format!(
1300                        "non-nullable child of type {} contains nulls not present in parent {}",
1301                        child.data_type, self.data_type
1302                    ))),
1303                };
1304            }
1305        };
1306
1307        match child.nulls() {
1308            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1309                "non-nullable child of type {} contains nulls not present in parent",
1310                child.data_type
1311            ))),
1312            _ => Ok(()),
1313        }
1314    }
1315
1316    /// Validates the values stored within this [`ArrayData`] are valid
1317    /// without recursing into child [`ArrayData`]
1318    ///
1319    /// Does not (yet) check
1320    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1321    pub fn validate_values(&self) -> Result<(), ArrowError> {
1322        match &self.data_type {
1323            DataType::Utf8 => self.validate_utf8::<i32>(),
1324            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1325            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1326            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1327            DataType::BinaryView => {
1328                let views = self.typed_buffer::<u128>(0, self.len)?;
1329                validate_binary_view(views, &self.buffers[1..])
1330            }
1331            DataType::Utf8View => {
1332                let views = self.typed_buffer::<u128>(0, self.len)?;
1333                validate_string_view(views, &self.buffers[1..])
1334            }
1335            DataType::List(_) | DataType::Map(_, _) => {
1336                let child = &self.child_data[0];
1337                self.validate_offsets_full::<i32>(child.len)
1338            }
1339            DataType::LargeList(_) => {
1340                let child = &self.child_data[0];
1341                self.validate_offsets_full::<i64>(child.len)
1342            }
1343            DataType::Union(_, _) => {
1344                // Validate Union Array as part of implementing new Union semantics
1345                // See comments in `ArrayData::validate()`
1346                // https://github.com/apache/arrow-rs/issues/85
1347                //
1348                // TODO file follow on ticket for full union validation
1349                Ok(())
1350            }
1351            DataType::Dictionary(key_type, _value_type) => {
1352                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1353                let max_value = dictionary_length - 1;
1354                match key_type.as_ref() {
1355                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1356                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1357                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1358                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1359                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1360                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1361                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1362                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1363                    _ => unreachable!(),
1364                }
1365            }
1366            DataType::RunEndEncoded(run_ends, _values) => {
1367                let run_ends_data = self.child_data()[0].clone();
1368                match run_ends.data_type() {
1369                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1370                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1371                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1372                    _ => unreachable!(),
1373                }
1374            }
1375            _ => {
1376                // No extra validation check required for other types
1377                Ok(())
1378            }
1379        }
1380    }
1381
1382    /// Calls the `validate(item_index, range)` function for each of
1383    /// the ranges specified in the arrow offsets buffer of type
1384    /// `T`. Also validates that each offset is smaller than
1385    /// `offset_limit`
1386    ///
1387    /// For an empty array, the offsets buffer can either be empty
1388    /// or contain a single `0`.
1389    ///
1390    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1391    /// function would call `validate([1,2])`, and `validate([2,4])`
1392    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1393    where
1394        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1395        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1396    {
1397        self.typed_offsets::<T>()?
1398            .iter()
1399            .enumerate()
1400            .map(|(i, x)| {
1401                // check if the offset can be converted to usize
1402                let r = x.to_usize().ok_or_else(|| {
1403                    ArrowError::InvalidArgumentError(format!(
1404                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1405                    );
1406                // check if the offset exceeds the limit
1407                match r {
1408                    Ok(n) if n <= offset_limit => Ok((i, n)),
1409                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1410                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1411                    ),
1412                    Err(e) => Err(e),
1413                }
1414            })
1415            .scan(0_usize, |start, end| {
1416                // check offsets are monotonically increasing
1417                match end {
1418                    Ok((i, end)) if *start <= end => {
1419                        let range = Some(Ok((i, *start..end)));
1420                        *start = end;
1421                        range
1422                    }
1423                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1424                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1425                        i - 1, start, end))
1426                    )),
1427                    Err(err) => Some(Err(err)),
1428                }
1429            })
1430            .skip(1) // the first element is meaningless
1431            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1432                let (item_index, range) = res?;
1433                validate(item_index-1, range)
1434            })
1435    }
1436
1437    /// Ensures that all strings formed by the offsets in `buffers[0]`
1438    /// into `buffers[1]` are valid utf8 sequences
1439    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1440    where
1441        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1442    {
1443        let values_buffer = &self.buffers[1].as_slice();
1444        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1445            // Validate Offsets are correct
1446            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1447                if !values_str.is_char_boundary(range.start)
1448                    || !values_str.is_char_boundary(range.end)
1449                {
1450                    return Err(ArrowError::InvalidArgumentError(format!(
1451                        "incomplete utf-8 byte sequence from index {string_index}"
1452                    )));
1453                }
1454                Ok(())
1455            })
1456        } else {
1457            // find specific offset that failed utf8 validation
1458            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1459                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1460                    ArrowError::InvalidArgumentError(format!(
1461                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1462                    ))
1463                })?;
1464                Ok(())
1465            })
1466        }
1467    }
1468
1469    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1470    /// between `0` and `offset_limit`
1471    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1472    where
1473        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1474    {
1475        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1476            // No validation applied to each value, but the iteration
1477            // itself applies bounds checking to each range
1478            Ok(())
1479        })
1480    }
1481
1482    /// Validates that each value in self.buffers (typed as T)
1483    /// is within the range [0, max_value], inclusive
1484    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1485    where
1486        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1487    {
1488        let required_len = self.len + self.offset;
1489        let buffer = &self.buffers[0];
1490
1491        // This should have been checked as part of `validate()` prior
1492        // to calling `validate_full()` but double check to be sure
1493        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1494
1495        // Justification: buffer size was validated above
1496        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1497
1498        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1499            // Do not check the value is null (value can be arbitrary)
1500            if self.is_null(i) {
1501                return Ok(());
1502            }
1503            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1504                ArrowError::InvalidArgumentError(format!(
1505                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1506                ))
1507            })?;
1508
1509            if dict_index < 0 || dict_index > max_value {
1510                return Err(ArrowError::InvalidArgumentError(format!(
1511                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1512                )));
1513            }
1514            Ok(())
1515        })
1516    }
1517
1518    /// Validates that each value in run_ends array is positive and strictly increasing.
1519    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1520    where
1521        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1522    {
1523        let values = self.typed_buffer::<T>(0, self.len)?;
1524        let mut prev_value: i64 = 0_i64;
1525        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1526            let value: i64 = inp_value.try_into().map_err(|_| {
1527                ArrowError::InvalidArgumentError(format!(
1528                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1529                ))
1530            })?;
1531            if value <= 0_i64 {
1532                return Err(ArrowError::InvalidArgumentError(format!(
1533                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1534                )));
1535            }
1536            if ix > 0 && value <= prev_value {
1537                return Err(ArrowError::InvalidArgumentError(format!(
1538                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1539                )));
1540            }
1541
1542            prev_value = value;
1543            Ok(())
1544        })?;
1545
1546        if prev_value.as_usize() < (self.offset + self.len) {
1547            return Err(ArrowError::InvalidArgumentError(format!(
1548                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1549                self.offset + self.len
1550            )));
1551        }
1552        Ok(())
1553    }
1554
1555    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1556    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1557    /// return false when the arrays are logically equal
1558    pub fn ptr_eq(&self, other: &Self) -> bool {
1559        if self.offset != other.offset
1560            || self.len != other.len
1561            || self.data_type != other.data_type
1562            || self.buffers.len() != other.buffers.len()
1563            || self.child_data.len() != other.child_data.len()
1564        {
1565            return false;
1566        }
1567
1568        match (&self.nulls, &other.nulls) {
1569            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1570            (Some(_), None) | (None, Some(_)) => return false,
1571            _ => {}
1572        };
1573
1574        if !self
1575            .buffers
1576            .iter()
1577            .zip(other.buffers.iter())
1578            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1579        {
1580            return false;
1581        }
1582
1583        self.child_data
1584            .iter()
1585            .zip(other.child_data.iter())
1586            .all(|(a, b)| a.ptr_eq(b))
1587    }
1588
1589    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1590    pub fn into_builder(self) -> ArrayDataBuilder {
1591        self.into()
1592    }
1593}
1594
1595/// Return the expected [`DataTypeLayout`] Arrays of this data
1596/// type are expected to have
1597pub fn layout(data_type: &DataType) -> DataTypeLayout {
1598    // based on C/C++ implementation in
1599    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1600    use arrow_schema::IntervalUnit::*;
1601
1602    match data_type {
1603        DataType::Null => DataTypeLayout {
1604            buffers: vec![],
1605            can_contain_null_mask: false,
1606            variadic: false,
1607        },
1608        DataType::Boolean => DataTypeLayout {
1609            buffers: vec![BufferSpec::BitMap],
1610            can_contain_null_mask: true,
1611            variadic: false,
1612        },
1613        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1614        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1615        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1616        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1617        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1618        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1619        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1620        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1621        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1622        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1623        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1624        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1625        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1626        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1627        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1628        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1629        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1630        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1631        DataType::Interval(MonthDayNano) => {
1632            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1633        }
1634        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1635        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1636        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1637        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1638        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1639        DataType::FixedSizeBinary(size) => {
1640            let spec = BufferSpec::FixedWidth {
1641                byte_width: (*size).try_into().unwrap(),
1642                alignment: mem::align_of::<u8>(),
1643            };
1644            DataTypeLayout {
1645                buffers: vec![spec],
1646                can_contain_null_mask: true,
1647                variadic: false,
1648            }
1649        }
1650        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1651        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1652        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1653        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1654        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1655        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1656        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1657        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1658        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1659        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1660        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1661        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1662        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1663        DataType::Union(_, mode) => {
1664            let type_ids = BufferSpec::FixedWidth {
1665                byte_width: mem::size_of::<i8>(),
1666                alignment: mem::align_of::<i8>(),
1667            };
1668
1669            DataTypeLayout {
1670                buffers: match mode {
1671                    UnionMode::Sparse => {
1672                        vec![type_ids]
1673                    }
1674                    UnionMode::Dense => {
1675                        vec![
1676                            type_ids,
1677                            BufferSpec::FixedWidth {
1678                                byte_width: mem::size_of::<i32>(),
1679                                alignment: mem::align_of::<i32>(),
1680                            },
1681                        ]
1682                    }
1683                },
1684                can_contain_null_mask: false,
1685                variadic: false,
1686            }
1687        }
1688        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1689    }
1690}
1691
1692/// Layout specification for a data type
1693#[derive(Debug, PartialEq, Eq)]
1694// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
1695pub struct DataTypeLayout {
1696    /// A vector of buffer layout specifications, one for each expected buffer
1697    pub buffers: Vec<BufferSpec>,
1698
1699    /// Can contain a null bitmask
1700    pub can_contain_null_mask: bool,
1701
1702    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
1703    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
1704    /// buffers.len(). Buffers that exceed the lower bound are legal.
1705    pub variadic: bool,
1706}
1707
1708impl DataTypeLayout {
1709    /// Describes a basic numeric array where each element has type `T`
1710    pub fn new_fixed_width<T>() -> Self {
1711        Self {
1712            buffers: vec![BufferSpec::FixedWidth {
1713                byte_width: mem::size_of::<T>(),
1714                alignment: mem::align_of::<T>(),
1715            }],
1716            can_contain_null_mask: true,
1717            variadic: false,
1718        }
1719    }
1720
1721    /// Describes arrays which have no data of their own
1722    /// but may still have a Null Bitmap (e.g. FixedSizeList)
1723    pub fn new_nullable_empty() -> Self {
1724        Self {
1725            buffers: vec![],
1726            can_contain_null_mask: true,
1727            variadic: false,
1728        }
1729    }
1730
1731    /// Describes arrays which have no data of their own
1732    /// (e.g. RunEndEncoded).
1733    pub fn new_empty() -> Self {
1734        Self {
1735            buffers: vec![],
1736            can_contain_null_mask: false,
1737            variadic: false,
1738        }
1739    }
1740
1741    /// Describes a basic numeric array where each element has a fixed
1742    /// with offset buffer of type `T`, followed by a
1743    /// variable width data buffer
1744    pub fn new_binary<T>() -> Self {
1745        Self {
1746            buffers: vec![
1747                // offsets
1748                BufferSpec::FixedWidth {
1749                    byte_width: mem::size_of::<T>(),
1750                    alignment: mem::align_of::<T>(),
1751                },
1752                // values
1753                BufferSpec::VariableWidth,
1754            ],
1755            can_contain_null_mask: true,
1756            variadic: false,
1757        }
1758    }
1759
1760    /// Describes a view type
1761    pub fn new_view() -> Self {
1762        Self {
1763            buffers: vec![BufferSpec::FixedWidth {
1764                byte_width: mem::size_of::<u128>(),
1765                alignment: mem::align_of::<u128>(),
1766            }],
1767            can_contain_null_mask: true,
1768            variadic: true,
1769        }
1770    }
1771
1772    /// Describes a list view type
1773    pub fn new_list_view<T>() -> Self {
1774        Self {
1775            buffers: vec![
1776                BufferSpec::FixedWidth {
1777                    byte_width: mem::size_of::<T>(),
1778                    alignment: mem::align_of::<T>(),
1779                },
1780                BufferSpec::FixedWidth {
1781                    byte_width: mem::size_of::<T>(),
1782                    alignment: mem::align_of::<T>(),
1783                },
1784            ],
1785            can_contain_null_mask: true,
1786            variadic: true,
1787        }
1788    }
1789}
1790
1791/// Layout specification for a single data type buffer
1792#[derive(Debug, PartialEq, Eq)]
1793pub enum BufferSpec {
1794    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
1795    ///
1796    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
1797    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
1798    ///
1799    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
1800    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
1801    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
1802    ///
1803    /// Note that these alignment requirements will vary between architectures
1804    FixedWidth {
1805        /// The width of each element in bytes
1806        byte_width: usize,
1807        /// The alignment required by Rust for an array of the corresponding primitive
1808        alignment: usize,
1809    },
1810    /// Variable width, such as string data for utf8 data
1811    VariableWidth,
1812    /// Buffer holds a bitmap.
1813    ///
1814    /// Note: Unlike the C++ implementation, the null/validity buffer
1815    /// is handled specially rather than as another of the buffers in
1816    /// the spec, so this variant is only used for the Boolean type.
1817    BitMap,
1818    /// Buffer is always null. Unused currently in Rust implementation,
1819    /// (used in C++ for Union type)
1820    #[allow(dead_code)]
1821    AlwaysNull,
1822}
1823
1824impl PartialEq for ArrayData {
1825    fn eq(&self, other: &Self) -> bool {
1826        equal::equal(self, other)
1827    }
1828}
1829
1830/// A boolean flag that cannot be mutated outside of unsafe code.
1831///
1832/// Defaults to a value of false.
1833///
1834/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
1835///
1836/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
1837///
1838/// # Example
1839/// ```rust
1840/// use arrow_data::UnsafeFlag;
1841/// assert!(!UnsafeFlag::default().get()); // default is false
1842/// let mut flag = UnsafeFlag::new();
1843/// assert!(!flag.get()); // defaults to false
1844/// // can only set it to true in unsafe code
1845/// unsafe { flag.set(true) };
1846/// assert!(flag.get()); // now true
1847/// ```
1848#[derive(Debug, Clone)]
1849#[doc(hidden)]
1850pub struct UnsafeFlag(bool);
1851
1852impl UnsafeFlag {
1853    /// Creates a new `UnsafeFlag` with the value set to `false`.
1854    ///
1855    /// See examples on [`Self::new`]
1856    #[inline]
1857    pub const fn new() -> Self {
1858        Self(false)
1859    }
1860
1861    /// Sets the value of the flag to the given value
1862    ///
1863    /// Note this can purposely only be done in `unsafe` code
1864    ///
1865    /// # Safety
1866    ///
1867    /// If set, the flag will be set to the given value. There is nothing
1868    /// immediately unsafe about doing so, however, the flag can be used to
1869    /// subsequently bypass safety checks in the [`ArrayDataBuilder`].
1870    #[inline]
1871    pub unsafe fn set(&mut self, val: bool) {
1872        self.0 = val;
1873    }
1874
1875    /// Returns the value of the flag
1876    #[inline]
1877    pub fn get(&self) -> bool {
1878        self.0
1879    }
1880}
1881
1882// Manual impl to make it clear you can not construct unsafe with true
1883impl Default for UnsafeFlag {
1884    fn default() -> Self {
1885        Self::new()
1886    }
1887}
1888
1889/// Builder for [`ArrayData`] type
1890#[derive(Debug)]
1891pub struct ArrayDataBuilder {
1892    data_type: DataType,
1893    len: usize,
1894    null_count: Option<usize>,
1895    null_bit_buffer: Option<Buffer>,
1896    nulls: Option<NullBuffer>,
1897    offset: usize,
1898    buffers: Vec<Buffer>,
1899    child_data: Vec<ArrayData>,
1900    /// Should buffers be realigned (copying if necessary)?
1901    ///
1902    /// Defaults to false.
1903    align_buffers: bool,
1904    /// Should data validation be skipped for this [`ArrayData`]?
1905    ///
1906    /// Defaults to false.
1907    ///
1908    /// # Safety
1909    ///
1910    /// This flag can only be set to true using `unsafe` APIs. However, once true
1911    /// subsequent calls to `build()` may result in undefined behavior if the data
1912    /// is not valid.
1913    skip_validation: UnsafeFlag,
1914}
1915
1916impl ArrayDataBuilder {
1917    #[inline]
1918    /// Creates a new array data builder
1919    pub const fn new(data_type: DataType) -> Self {
1920        Self {
1921            data_type,
1922            len: 0,
1923            null_count: None,
1924            null_bit_buffer: None,
1925            nulls: None,
1926            offset: 0,
1927            buffers: vec![],
1928            child_data: vec![],
1929            align_buffers: false,
1930            skip_validation: UnsafeFlag::new(),
1931        }
1932    }
1933
1934    /// Creates a new array data builder from an existing one, changing the data type
1935    pub fn data_type(self, data_type: DataType) -> Self {
1936        Self { data_type, ..self }
1937    }
1938
1939    #[inline]
1940    #[allow(clippy::len_without_is_empty)]
1941    /// Sets the length of the [ArrayData]
1942    pub const fn len(mut self, n: usize) -> Self {
1943        self.len = n;
1944        self
1945    }
1946
1947    /// Sets the null buffer of the [ArrayData]
1948    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1949        self.nulls = nulls;
1950        self.null_count = None;
1951        self.null_bit_buffer = None;
1952        self
1953    }
1954
1955    /// Sets the null count of the [ArrayData]
1956    pub fn null_count(mut self, null_count: usize) -> Self {
1957        self.null_count = Some(null_count);
1958        self
1959    }
1960
1961    /// Sets the `null_bit_buffer` of the [ArrayData]
1962    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1963        self.nulls = None;
1964        self.null_bit_buffer = buf;
1965        self
1966    }
1967
1968    /// Sets the offset of the [ArrayData]
1969    #[inline]
1970    pub const fn offset(mut self, n: usize) -> Self {
1971        self.offset = n;
1972        self
1973    }
1974
1975    /// Sets the buffers of the [ArrayData]
1976    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1977        self.buffers = v;
1978        self
1979    }
1980
1981    /// Adds a single buffer to the [ArrayData]'s buffers
1982    pub fn add_buffer(mut self, b: Buffer) -> Self {
1983        self.buffers.push(b);
1984        self
1985    }
1986
1987    /// Adds multiple buffers to the [ArrayData]'s buffers
1988    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1989        self.buffers.extend(bs);
1990        self
1991    }
1992
1993    /// Sets the child data of the [ArrayData]
1994    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1995        self.child_data = v;
1996        self
1997    }
1998
1999    /// Adds a single child data to the [ArrayData]'s child data
2000    pub fn add_child_data(mut self, r: ArrayData) -> Self {
2001        self.child_data.push(r);
2002        self
2003    }
2004
2005    /// Creates an array data, without any validation
2006    ///
2007    /// Note: This is shorthand for
2008    /// ```rust
2009    /// # #[expect(unsafe_op_in_unsafe_fn)]
2010    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
2011    /// # let _ = unsafe {
2012    /// builder.skip_validation(true).build().unwrap()
2013    /// # };
2014    /// ```
2015    ///
2016    /// # Safety
2017    ///
2018    /// The same caveats as [`ArrayData::new_unchecked`]
2019    /// apply.
2020    pub unsafe fn build_unchecked(self) -> ArrayData {
2021        unsafe { self.skip_validation(true) }.build().unwrap()
2022    }
2023
2024    /// Creates an `ArrayData`, consuming `self`
2025    ///
2026    /// # Safety
2027    ///
2028    /// By default the underlying buffers are checked to ensure they are valid
2029    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
2030    /// to true (by the `unsafe` API) this validation is skipped. If the data is
2031    /// not valid, undefined behavior will result.
2032    pub fn build(self) -> Result<ArrayData, ArrowError> {
2033        let Self {
2034            data_type,
2035            len,
2036            null_count,
2037            null_bit_buffer,
2038            nulls,
2039            offset,
2040            buffers,
2041            child_data,
2042            align_buffers,
2043            skip_validation,
2044        } = self;
2045
2046        let nulls = nulls
2047            .or_else(|| {
2048                let buffer = null_bit_buffer?;
2049                let buffer = BooleanBuffer::new(buffer, offset, len);
2050                Some(match null_count {
2051                    Some(n) => {
2052                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2053                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2054                    }
2055                    None => NullBuffer::new(buffer),
2056                })
2057            })
2058            .filter(|b| b.null_count() != 0);
2059
2060        let mut data = ArrayData {
2061            data_type,
2062            len,
2063            offset,
2064            buffers,
2065            child_data,
2066            nulls,
2067        };
2068
2069        if align_buffers {
2070            data.align_buffers();
2071        }
2072
2073        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2074        if !skip_validation.get() || cfg!(feature = "force_validate") {
2075            data.validate_data()?;
2076        }
2077        Ok(data)
2078    }
2079
2080    /// Creates an array data, validating all inputs, and aligning any buffers
2081    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2082    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2083        self.align_buffers(true).build()
2084    }
2085
2086    /// Ensure that all buffers are aligned, copying data if necessary
2087    ///
2088    /// Rust requires that arrays are aligned to their corresponding primitive,
2089    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2090    ///
2091    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2092    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2093    ///
2094    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2095    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2096    /// when necessary, making it useful when interacting with buffers produced by other systems,
2097    /// e.g. IPC or FFI.
2098    ///
2099    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2100    /// insufficiently aligned buffers.
2101    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2102        self.align_buffers = align_buffers;
2103        self
2104    }
2105
2106    /// Skips validation of the data.
2107    ///
2108    /// If this flag is enabled, `[Self::build`] will skip validation of the
2109    /// data
2110    ///
2111    /// If this flag is not enabled, `[Self::build`] will validate that all
2112    /// buffers are valid and will return an error if any data is invalid.
2113    /// Validation can be expensive.
2114    ///
2115    /// # Safety
2116    ///
2117    /// If validation is skipped, the buffers must form a valid Arrow array,
2118    /// otherwise undefined behavior will result
2119    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2120        unsafe {
2121            self.skip_validation.set(skip_validation);
2122        }
2123        self
2124    }
2125}
2126
2127impl From<ArrayData> for ArrayDataBuilder {
2128    fn from(d: ArrayData) -> Self {
2129        Self {
2130            data_type: d.data_type,
2131            len: d.len,
2132            offset: d.offset,
2133            buffers: d.buffers,
2134            child_data: d.child_data,
2135            nulls: d.nulls,
2136            null_bit_buffer: None,
2137            null_count: None,
2138            align_buffers: false,
2139            skip_validation: UnsafeFlag::new(),
2140        }
2141    }
2142}
2143
2144#[cfg(test)]
2145mod tests {
2146    use super::*;
2147    use arrow_schema::{Field, Fields};
2148
2149    // See arrow/tests/array_data_validation.rs for test of array validation
2150
2151    /// returns a buffer initialized with some constant value for tests
2152    fn make_i32_buffer(n: usize) -> Buffer {
2153        Buffer::from_slice_ref(vec![42i32; n])
2154    }
2155
2156    /// returns a buffer initialized with some constant value for tests
2157    fn make_f32_buffer(n: usize) -> Buffer {
2158        Buffer::from_slice_ref(vec![42f32; n])
2159    }
2160
2161    #[test]
2162    fn test_builder() {
2163        // Buffer needs to be at least 25 long
2164        let v = (0..25).collect::<Vec<i32>>();
2165        let b1 = Buffer::from_slice_ref(&v);
2166        let arr_data = ArrayData::builder(DataType::Int32)
2167            .len(20)
2168            .offset(5)
2169            .add_buffer(b1)
2170            .null_bit_buffer(Some(Buffer::from([
2171                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2172            ])))
2173            .build()
2174            .unwrap();
2175
2176        assert_eq!(20, arr_data.len());
2177        assert_eq!(10, arr_data.null_count());
2178        assert_eq!(5, arr_data.offset());
2179        assert_eq!(1, arr_data.buffers().len());
2180        assert_eq!(
2181            Buffer::from_slice_ref(&v).as_slice(),
2182            arr_data.buffers()[0].as_slice()
2183        );
2184    }
2185
2186    #[test]
2187    fn test_builder_with_child_data() {
2188        let child_arr_data = ArrayData::try_new(
2189            DataType::Int32,
2190            5,
2191            None,
2192            0,
2193            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2194            vec![],
2195        )
2196        .unwrap();
2197
2198        let field = Arc::new(Field::new("x", DataType::Int32, true));
2199        let data_type = DataType::Struct(vec![field].into());
2200
2201        let arr_data = ArrayData::builder(data_type)
2202            .len(5)
2203            .offset(0)
2204            .add_child_data(child_arr_data.clone())
2205            .build()
2206            .unwrap();
2207
2208        assert_eq!(5, arr_data.len());
2209        assert_eq!(1, arr_data.child_data().len());
2210        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2211    }
2212
2213    #[test]
2214    fn test_null_count() {
2215        let mut bit_v: [u8; 2] = [0; 2];
2216        bit_util::set_bit(&mut bit_v, 0);
2217        bit_util::set_bit(&mut bit_v, 3);
2218        bit_util::set_bit(&mut bit_v, 10);
2219        let arr_data = ArrayData::builder(DataType::Int32)
2220            .len(16)
2221            .add_buffer(make_i32_buffer(16))
2222            .null_bit_buffer(Some(Buffer::from(bit_v)))
2223            .build()
2224            .unwrap();
2225        assert_eq!(13, arr_data.null_count());
2226
2227        // Test with offset
2228        let mut bit_v: [u8; 2] = [0; 2];
2229        bit_util::set_bit(&mut bit_v, 0);
2230        bit_util::set_bit(&mut bit_v, 3);
2231        bit_util::set_bit(&mut bit_v, 10);
2232        let arr_data = ArrayData::builder(DataType::Int32)
2233            .len(12)
2234            .offset(2)
2235            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2236            .null_bit_buffer(Some(Buffer::from(bit_v)))
2237            .build()
2238            .unwrap();
2239        assert_eq!(10, arr_data.null_count());
2240    }
2241
2242    #[test]
2243    fn test_null_buffer_ref() {
2244        let mut bit_v: [u8; 2] = [0; 2];
2245        bit_util::set_bit(&mut bit_v, 0);
2246        bit_util::set_bit(&mut bit_v, 3);
2247        bit_util::set_bit(&mut bit_v, 10);
2248        let arr_data = ArrayData::builder(DataType::Int32)
2249            .len(16)
2250            .add_buffer(make_i32_buffer(16))
2251            .null_bit_buffer(Some(Buffer::from(bit_v)))
2252            .build()
2253            .unwrap();
2254        assert!(arr_data.nulls().is_some());
2255        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2256    }
2257
2258    #[test]
2259    fn test_slice() {
2260        let mut bit_v: [u8; 2] = [0; 2];
2261        bit_util::set_bit(&mut bit_v, 0);
2262        bit_util::set_bit(&mut bit_v, 3);
2263        bit_util::set_bit(&mut bit_v, 10);
2264        let data = ArrayData::builder(DataType::Int32)
2265            .len(16)
2266            .add_buffer(make_i32_buffer(16))
2267            .null_bit_buffer(Some(Buffer::from(bit_v)))
2268            .build()
2269            .unwrap();
2270        let new_data = data.slice(1, 15);
2271        assert_eq!(data.len() - 1, new_data.len());
2272        assert_eq!(1, new_data.offset());
2273        assert_eq!(data.null_count(), new_data.null_count());
2274
2275        // slice of a slice (removes one null)
2276        let new_data = new_data.slice(1, 14);
2277        assert_eq!(data.len() - 2, new_data.len());
2278        assert_eq!(2, new_data.offset());
2279        assert_eq!(data.null_count() - 1, new_data.null_count());
2280    }
2281
2282    #[test]
2283    fn test_equality() {
2284        let int_data = ArrayData::builder(DataType::Int32)
2285            .len(1)
2286            .add_buffer(make_i32_buffer(1))
2287            .build()
2288            .unwrap();
2289
2290        let float_data = ArrayData::builder(DataType::Float32)
2291            .len(1)
2292            .add_buffer(make_f32_buffer(1))
2293            .build()
2294            .unwrap();
2295        assert_ne!(int_data, float_data);
2296        assert!(!int_data.ptr_eq(&float_data));
2297        assert!(int_data.ptr_eq(&int_data));
2298
2299        #[allow(clippy::redundant_clone)]
2300        let int_data_clone = int_data.clone();
2301        assert_eq!(int_data, int_data_clone);
2302        assert!(int_data.ptr_eq(&int_data_clone));
2303        assert!(int_data_clone.ptr_eq(&int_data));
2304
2305        let int_data_slice = int_data_clone.slice(1, 0);
2306        assert!(int_data_slice.ptr_eq(&int_data_slice));
2307        assert!(!int_data.ptr_eq(&int_data_slice));
2308        assert!(!int_data_slice.ptr_eq(&int_data));
2309
2310        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2311        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2312        let string_data = ArrayData::try_new(
2313            DataType::Utf8,
2314            3,
2315            Some(Buffer::from_iter(vec![true, false, true])),
2316            0,
2317            vec![offsets_buffer, data_buffer],
2318            vec![],
2319        )
2320        .unwrap();
2321
2322        assert_ne!(float_data, string_data);
2323        assert!(!float_data.ptr_eq(&string_data));
2324
2325        assert!(string_data.ptr_eq(&string_data));
2326
2327        #[allow(clippy::redundant_clone)]
2328        let string_data_cloned = string_data.clone();
2329        assert!(string_data_cloned.ptr_eq(&string_data));
2330        assert!(string_data.ptr_eq(&string_data_cloned));
2331
2332        let string_data_slice = string_data.slice(1, 2);
2333        assert!(string_data_slice.ptr_eq(&string_data_slice));
2334        assert!(!string_data_slice.ptr_eq(&string_data))
2335    }
2336
2337    #[test]
2338    fn test_slice_memory_size() {
2339        let mut bit_v: [u8; 2] = [0; 2];
2340        bit_util::set_bit(&mut bit_v, 0);
2341        bit_util::set_bit(&mut bit_v, 3);
2342        bit_util::set_bit(&mut bit_v, 10);
2343        let data = ArrayData::builder(DataType::Int32)
2344            .len(16)
2345            .add_buffer(make_i32_buffer(16))
2346            .null_bit_buffer(Some(Buffer::from(bit_v)))
2347            .build()
2348            .unwrap();
2349        let new_data = data.slice(1, 14);
2350        assert_eq!(
2351            data.get_slice_memory_size().unwrap() - 8,
2352            new_data.get_slice_memory_size().unwrap()
2353        );
2354        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2355        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2356        let string_data = ArrayData::try_new(
2357            DataType::Utf8,
2358            3,
2359            Some(Buffer::from_iter(vec![true, false, true])),
2360            0,
2361            vec![offsets_buffer, data_buffer],
2362            vec![],
2363        )
2364        .unwrap();
2365        let string_data_slice = string_data.slice(1, 2);
2366        //4 bytes of offset and 2 bytes of data reduced by slicing.
2367        assert_eq!(
2368            string_data.get_slice_memory_size().unwrap() - 6,
2369            string_data_slice.get_slice_memory_size().unwrap()
2370        );
2371    }
2372
2373    #[test]
2374    fn test_count_nulls() {
2375        let buffer = Buffer::from([0b00010110, 0b10011111]);
2376        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2377        let count = count_nulls(Some(&buffer), 0, 16);
2378        assert_eq!(count, 7);
2379
2380        let count = count_nulls(Some(&buffer), 4, 8);
2381        assert_eq!(count, 3);
2382    }
2383
2384    #[test]
2385    fn test_contains_nulls() {
2386        let buffer: Buffer =
2387            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2388        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2389        assert!(contains_nulls(Some(&buffer), 0, 6));
2390        assert!(contains_nulls(Some(&buffer), 0, 3));
2391        assert!(!contains_nulls(Some(&buffer), 3, 2));
2392        assert!(!contains_nulls(Some(&buffer), 0, 0));
2393    }
2394
2395    #[test]
2396    fn test_alignment() {
2397        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2398        let sliced = buffer.slice(1);
2399
2400        let mut data = ArrayData {
2401            data_type: DataType::Int32,
2402            len: 0,
2403            offset: 0,
2404            buffers: vec![buffer],
2405            child_data: vec![],
2406            nulls: None,
2407        };
2408        data.validate_full().unwrap();
2409
2410        // break alignment in data
2411        data.buffers[0] = sliced;
2412        let err = data.validate().unwrap_err();
2413
2414        assert_eq!(
2415            err.to_string(),
2416            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2417        );
2418
2419        data.align_buffers();
2420        data.validate_full().unwrap();
2421    }
2422
2423    #[test]
2424    fn test_alignment_struct() {
2425        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2426        let sliced = buffer.slice(1);
2427
2428        let child_data = ArrayData {
2429            data_type: DataType::Int32,
2430            len: 0,
2431            offset: 0,
2432            buffers: vec![buffer],
2433            child_data: vec![],
2434            nulls: None,
2435        };
2436
2437        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2438        let mut data = ArrayData {
2439            data_type: schema,
2440            len: 0,
2441            offset: 0,
2442            buffers: vec![],
2443            child_data: vec![child_data],
2444            nulls: None,
2445        };
2446        data.validate_full().unwrap();
2447
2448        // break alignment in child data
2449        data.child_data[0].buffers[0] = sliced;
2450        let err = data.validate().unwrap_err();
2451
2452        assert_eq!(
2453            err.to_string(),
2454            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2455        );
2456
2457        data.align_buffers();
2458        data.validate_full().unwrap();
2459    }
2460
2461    #[test]
2462    fn test_null_view_types() {
2463        let array_len = 32;
2464        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2465        assert_eq!(array.len(), array_len);
2466        for i in 0..array.len() {
2467            assert!(array.is_null(i));
2468        }
2469
2470        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2471        assert_eq!(array.len(), array_len);
2472        for i in 0..array.len() {
2473            assert!(array.is_null(i));
2474        }
2475    }
2476}