arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
19//! common attributes and operations for Arrow array.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35    null_bit_buffer: Option<&NullBuffer>,
36    offset: usize,
37    len: usize,
38) -> bool {
39    match null_bit_buffer {
40        Some(buffer) => {
41            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42                Some((start, end)) => start != 0 || end != len,
43                None => len != 0, // No non-null values
44            }
45        }
46        None => false, // No null buffer
47    }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52    null_bit_buffer: Option<&NullBuffer>,
53    offset: usize,
54    len: usize,
55) -> usize {
56    if let Some(buf) = null_bit_buffer {
57        let buffer = buf.buffer();
58        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59    } else {
60        0
61    }
62}
63
64/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67    let empty_buffer = MutableBuffer::new(0);
68    match data_type {
69        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70        DataType::Boolean => {
71            let bytes = bit_util::ceil(capacity, 8);
72            let buffer = MutableBuffer::new(bytes);
73            [buffer, empty_buffer]
74        }
75        DataType::UInt8
76        | DataType::UInt16
77        | DataType::UInt32
78        | DataType::UInt64
79        | DataType::Int8
80        | DataType::Int16
81        | DataType::Int32
82        | DataType::Int64
83        | DataType::Float16
84        | DataType::Float32
85        | DataType::Float64
86        | DataType::Decimal128(_, _)
87        | DataType::Decimal256(_, _)
88        | DataType::Date32
89        | DataType::Time32(_)
90        | DataType::Date64
91        | DataType::Time64(_)
92        | DataType::Duration(_)
93        | DataType::Timestamp(_, _)
94        | DataType::Interval(_) => [
95            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
96            empty_buffer,
97        ],
98        DataType::Utf8 | DataType::Binary => {
99            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
100            // safety: `unsafe` code assumes that this buffer is initialized with one element
101            buffer.push(0i32);
102            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
103        }
104        DataType::LargeUtf8 | DataType::LargeBinary => {
105            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
106            // safety: `unsafe` code assumes that this buffer is initialized with one element
107            buffer.push(0i64);
108            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
109        }
110        DataType::BinaryView | DataType::Utf8View => [
111            MutableBuffer::new(capacity * mem::size_of::<u128>()),
112            empty_buffer,
113        ],
114        DataType::List(_) | DataType::Map(_, _) => {
115            // offset buffer always starts with a zero
116            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
117            buffer.push(0i32);
118            [buffer, empty_buffer]
119        }
120        DataType::ListView(_) => [
121            MutableBuffer::new(capacity * mem::size_of::<i32>()),
122            MutableBuffer::new(capacity * mem::size_of::<i32>()),
123        ],
124        DataType::LargeList(_) => {
125            // offset buffer always starts with a zero
126            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
127            buffer.push(0i64);
128            [buffer, empty_buffer]
129        }
130        DataType::LargeListView(_) => [
131            MutableBuffer::new(capacity * mem::size_of::<i64>()),
132            MutableBuffer::new(capacity * mem::size_of::<i64>()),
133        ],
134        DataType::FixedSizeBinary(size) => {
135            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
136        }
137        DataType::Dictionary(k, _) => [
138            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
139            empty_buffer,
140        ],
141        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
142            [empty_buffer, MutableBuffer::new(0)]
143        }
144        DataType::Union(_, mode) => {
145            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
146            match mode {
147                UnionMode::Sparse => [type_ids, empty_buffer],
148                UnionMode::Dense => {
149                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
150                    [type_ids, offsets]
151                }
152            }
153        }
154    }
155}
156
157/// A generic representation of Arrow array data which encapsulates common attributes
158/// and operations for Arrow array.
159///
160/// Specific operations for different arrays types (e.g., primitive, list, struct)
161/// are implemented in `Array`.
162///
163/// # Memory Layout
164///
165/// `ArrayData` has references to one or more underlying data buffers
166/// and optional child ArrayData, depending on type as illustrated
167/// below. Bitmaps are not shown for simplicity but they are stored
168/// similarly to the buffers.
169///
170/// ```text
171///                        offset
172///                       points to
173/// ┌───────────────────┐ start of  ┌───────┐       Different
174/// │                   │   data    │       │     ArrayData may
175/// │ArrayData {        │           │....   │     also refers to
176/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
177/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
178/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
179/// │  buffers: [       │           │5882   │◀─
180/// │    ...            │  │        │4323   │
181/// │  ]                │   ─ ─ ─ ─▶│4859   │
182/// │  child_data: [    │           │....   │
183/// │    ...            │           │       │
184/// │  ]                │           └───────┘
185/// │}                  │
186/// │                   │            Shared Buffer uses
187/// │               │   │            bytes::Bytes to hold
188/// └───────────────────┘            actual data values
189///           ┌ ─ ─ ┘
190///
191///           ▼
192/// ┌───────────────────┐
193/// │ArrayData {        │
194/// │  ...              │
195/// │}                  │
196/// │                   │
197/// └───────────────────┘
198///
199/// Child ArrayData may also have its own buffers and children
200/// ```
201
202#[derive(Debug, Clone)]
203pub struct ArrayData {
204    /// The data type
205    data_type: DataType,
206
207    /// The number of elements
208    len: usize,
209
210    /// The offset in number of items (not bytes).
211    ///
212    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
213    /// does NOT apply to [`Self::nulls`].
214    offset: usize,
215
216    /// The buffers that store the actual data for this array, as defined
217    /// in the [Arrow Spec].
218    ///
219    /// Depending on the array types, [`Self::buffers`] can hold different
220    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
221    /// positions.
222    ///
223    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
224    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
225    /// we need to satisfy `len`.
226    ///
227    /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout)
228    buffers: Vec<Buffer>,
229
230    /// The child(ren) of this array.
231    ///
232    /// Only non-empty for nested types, such as `ListArray` and
233    /// `StructArray`.
234    ///
235    /// The first logical element in each child element begins at `offset`.
236    ///
237    /// If the child element also has an offset then these offsets are
238    /// cumulative.
239    child_data: Vec<ArrayData>,
240
241    /// The null bitmap.
242    ///
243    /// `None` indicates all values are non-null in this array.
244    ///
245    /// [`Self::offset]` does not apply to the null bitmap. While the
246    /// BooleanBuffer may be sliced (have its own offset) internally, this
247    /// `NullBuffer` always represents exactly `len` elements.
248    nulls: Option<NullBuffer>,
249}
250
251/// A thread-safe, shared reference to the Arrow array data.
252pub type ArrayDataRef = Arc<ArrayData>;
253
254impl ArrayData {
255    /// Create a new ArrayData instance;
256    ///
257    /// If `null_count` is not specified, the number of nulls in
258    /// null_bit_buffer is calculated.
259    ///
260    /// If the number of nulls is 0 then the null_bit_buffer
261    /// is set to `None`.
262    ///
263    /// # Safety
264    ///
265    /// The input values *must* form a valid Arrow array for
266    /// `data_type`, or undefined behavior can result.
267    ///
268    /// Note: This is a low level API and most users of the arrow
269    /// crate should create arrays using the methods in the `array`
270    /// module.
271    pub unsafe fn new_unchecked(
272        data_type: DataType,
273        len: usize,
274        null_count: Option<usize>,
275        null_bit_buffer: Option<Buffer>,
276        offset: usize,
277        buffers: Vec<Buffer>,
278        child_data: Vec<ArrayData>,
279    ) -> Self {
280        let mut skip_validation = UnsafeFlag::new();
281        // SAFETY: caller responsible for ensuring data is valid
282        skip_validation.set(true);
283
284        ArrayDataBuilder {
285            data_type,
286            len,
287            null_count,
288            null_bit_buffer,
289            nulls: None,
290            offset,
291            buffers,
292            child_data,
293            align_buffers: false,
294            skip_validation,
295        }
296        .build()
297        .unwrap()
298    }
299
300    /// Create a new ArrayData, validating that the provided buffers form a valid
301    /// Arrow array of the specified data type.
302    ///
303    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
304    /// is set to `None`.
305    ///
306    /// Internally this calls through to [`Self::validate_data`]
307    ///
308    /// Note: This is a low level API and most users of the arrow crate should create
309    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
310    pub fn try_new(
311        data_type: DataType,
312        len: usize,
313        null_bit_buffer: Option<Buffer>,
314        offset: usize,
315        buffers: Vec<Buffer>,
316        child_data: Vec<ArrayData>,
317    ) -> Result<Self, ArrowError> {
318        // we must check the length of `null_bit_buffer` first
319        // because we use this buffer to calculate `null_count`
320        // in `Self::new_unchecked`.
321        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
322            let needed_len = bit_util::ceil(len + offset, 8);
323            if null_bit_buffer.len() < needed_len {
324                return Err(ArrowError::InvalidArgumentError(format!(
325                    "null_bit_buffer size too small. got {} needed {}",
326                    null_bit_buffer.len(),
327                    needed_len
328                )));
329            }
330        }
331        // Safety justification: `validate_full` is called below
332        let new_self = unsafe {
333            Self::new_unchecked(
334                data_type,
335                len,
336                None,
337                null_bit_buffer,
338                offset,
339                buffers,
340                child_data,
341            )
342        };
343
344        // As the data is not trusted, do a full validation of its contents
345        // We don't need to validate children as we can assume that the
346        // [`ArrayData`] in `child_data` have already been validated through
347        // a call to `ArrayData::try_new` or created using unsafe
348        new_self.validate_data()?;
349        Ok(new_self)
350    }
351
352    /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`]
353    #[inline]
354    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
355        ArrayDataBuilder::new(data_type)
356    }
357
358    /// Returns a reference to the [`DataType`] of this [`ArrayData`]
359    #[inline]
360    pub const fn data_type(&self) -> &DataType {
361        &self.data_type
362    }
363
364    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
365    pub fn buffers(&self) -> &[Buffer] {
366        &self.buffers
367    }
368
369    /// Returns a slice of children [`ArrayData`]. This will be non
370    /// empty for type such as lists and structs.
371    pub fn child_data(&self) -> &[ArrayData] {
372        &self.child_data[..]
373    }
374
375    /// Returns whether the element at index `i` is null
376    #[inline]
377    pub fn is_null(&self, i: usize) -> bool {
378        match &self.nulls {
379            Some(v) => v.is_null(i),
380            None => false,
381        }
382    }
383
384    /// Returns a reference to the null buffer of this [`ArrayData`] if any
385    ///
386    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
387    #[inline]
388    pub fn nulls(&self) -> Option<&NullBuffer> {
389        self.nulls.as_ref()
390    }
391
392    /// Returns whether the element at index `i` is not null
393    #[inline]
394    pub fn is_valid(&self, i: usize) -> bool {
395        !self.is_null(i)
396    }
397
398    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
399    #[inline]
400    pub const fn len(&self) -> usize {
401        self.len
402    }
403
404    /// Returns whether this [`ArrayData`] is empty
405    #[inline]
406    pub const fn is_empty(&self) -> bool {
407        self.len == 0
408    }
409
410    /// Returns the offset of this [`ArrayData`]
411    #[inline]
412    pub const fn offset(&self) -> usize {
413        self.offset
414    }
415
416    /// Returns the total number of nulls in this array
417    #[inline]
418    pub fn null_count(&self) -> usize {
419        self.nulls
420            .as_ref()
421            .map(|x| x.null_count())
422            .unwrap_or_default()
423    }
424
425    /// Returns the total number of bytes of memory occupied by the
426    /// buffers owned by this [`ArrayData`] and all of its
427    /// children. (See also diagram on [`ArrayData`]).
428    ///
429    /// Note that this [`ArrayData`] may only refer to a subset of the
430    /// data in the underlying [`Buffer`]s (due to `offset` and
431    /// `length`), but the size returned includes the entire size of
432    /// the buffers.
433    ///
434    /// If multiple [`ArrayData`]s refer to the same underlying
435    /// [`Buffer`]s they will both report the same size.
436    pub fn get_buffer_memory_size(&self) -> usize {
437        let mut size = 0;
438        for buffer in &self.buffers {
439            size += buffer.capacity();
440        }
441        if let Some(bitmap) = &self.nulls {
442            size += bitmap.buffer().capacity()
443        }
444        for child in &self.child_data {
445            size += child.get_buffer_memory_size();
446        }
447        size
448    }
449
450    /// Returns the total number of the bytes of memory occupied by
451    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
452    ///
453    /// This is approximately the number of bytes if a new
454    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
455    /// exactly the data needed.
456    ///
457    /// For example, a [`DataType::Int64`] with `100` elements,
458    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
459    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
460    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
461    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
462    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
463        let mut result: usize = 0;
464        let layout = layout(&self.data_type);
465
466        for spec in layout.buffers.iter() {
467            match spec {
468                BufferSpec::FixedWidth { byte_width, .. } => {
469                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
470                        ArrowError::ComputeError(
471                            "Integer overflow computing buffer size".to_string(),
472                        )
473                    })?;
474                    result += buffer_size;
475                }
476                BufferSpec::VariableWidth => {
477                    let buffer_len: usize;
478                    match self.data_type {
479                        DataType::Utf8 | DataType::Binary => {
480                            let offsets = self.typed_offsets::<i32>()?;
481                            buffer_len = (offsets[self.len] - offsets[0] ) as usize;
482                        }
483                        DataType::LargeUtf8 | DataType::LargeBinary => {
484                            let offsets = self.typed_offsets::<i64>()?;
485                            buffer_len = (offsets[self.len] - offsets[0]) as usize;
486                        }
487                        _ => {
488                            return Err(ArrowError::NotYetImplemented(format!(
489                            "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
490                            self.data_type
491                            )))
492                        }
493                    };
494                    result += buffer_len;
495                }
496                BufferSpec::BitMap => {
497                    let buffer_size = bit_util::ceil(self.len, 8);
498                    result += buffer_size;
499                }
500                BufferSpec::AlwaysNull => {
501                    // Nothing to do
502                }
503            }
504        }
505
506        if self.nulls().is_some() {
507            result += bit_util::ceil(self.len, 8);
508        }
509
510        for child in &self.child_data {
511            result += child.get_slice_memory_size()?;
512        }
513        Ok(result)
514    }
515
516    /// Returns the total number of bytes of memory occupied
517    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
518    /// children. (See also diagram on [`ArrayData`]).
519    ///
520    /// Equivalent to:
521    ///  `size_of_val(self)` +
522    ///  [`Self::get_buffer_memory_size`] +
523    ///  `size_of_val(child)` for all children
524    pub fn get_array_memory_size(&self) -> usize {
525        let mut size = mem::size_of_val(self);
526
527        // Calculate rest of the fields top down which contain actual data
528        for buffer in &self.buffers {
529            size += mem::size_of::<Buffer>();
530            size += buffer.capacity();
531        }
532        if let Some(nulls) = &self.nulls {
533            size += nulls.buffer().capacity();
534        }
535        for child in &self.child_data {
536            size += child.get_array_memory_size();
537        }
538
539        size
540    }
541
542    /// Creates a zero-copy slice of itself. This creates a new
543    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
544    /// different offset and len
545    ///
546    /// # Panics
547    ///
548    /// Panics if `offset + length > self.len()`.
549    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
550        assert!((offset + length) <= self.len());
551
552        if let DataType::Struct(_) = self.data_type() {
553            // Slice into children
554            let new_offset = self.offset + offset;
555            let new_data = ArrayData {
556                data_type: self.data_type().clone(),
557                len: length,
558                offset: new_offset,
559                buffers: self.buffers.clone(),
560                // Slice child data, to propagate offsets down to them
561                child_data: self
562                    .child_data()
563                    .iter()
564                    .map(|data| data.slice(offset, length))
565                    .collect(),
566                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
567            };
568
569            new_data
570        } else {
571            let mut new_data = self.clone();
572
573            new_data.len = length;
574            new_data.offset = offset + self.offset;
575            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
576
577            new_data
578        }
579    }
580
581    /// Returns the `buffer` as a slice of type `T` starting at self.offset
582    ///
583    /// # Panics
584    /// This function panics if:
585    /// * the buffer is not byte-aligned with type T, or
586    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
587    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
588        &self.buffers()[buffer].typed_data()[self.offset..]
589    }
590
591    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
592    pub fn new_null(data_type: &DataType, len: usize) -> Self {
593        let bit_len = bit_util::ceil(len, 8);
594        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
595
596        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
597            Some(width) => (vec![zeroed(width * len)], vec![], true),
598            None => match data_type {
599                DataType::Null => (vec![], vec![], false),
600                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
601                DataType::Binary | DataType::Utf8 => {
602                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
603                }
604                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
605                DataType::LargeBinary | DataType::LargeUtf8 => {
606                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
607                }
608                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
609                DataType::List(f) | DataType::Map(f, _) => (
610                    vec![zeroed((len + 1) * 4)],
611                    vec![ArrayData::new_empty(f.data_type())],
612                    true,
613                ),
614                DataType::LargeList(f) => (
615                    vec![zeroed((len + 1) * 8)],
616                    vec![ArrayData::new_empty(f.data_type())],
617                    true,
618                ),
619                DataType::FixedSizeList(f, list_len) => (
620                    vec![],
621                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
622                    true,
623                ),
624                DataType::Struct(fields) => (
625                    vec![],
626                    fields
627                        .iter()
628                        .map(|f| Self::new_null(f.data_type(), len))
629                        .collect(),
630                    true,
631                ),
632                DataType::Dictionary(k, v) => (
633                    vec![zeroed(k.primitive_width().unwrap() * len)],
634                    vec![ArrayData::new_empty(v.as_ref())],
635                    true,
636                ),
637                DataType::Union(f, mode) => {
638                    let (id, _) = f.iter().next().unwrap();
639                    let ids = Buffer::from_iter(std::iter::repeat(id).take(len));
640                    let buffers = match mode {
641                        UnionMode::Sparse => vec![ids],
642                        UnionMode::Dense => {
643                            let end_offset = i32::from_usize(len).unwrap();
644                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
645                        }
646                    };
647
648                    let children = f
649                        .iter()
650                        .enumerate()
651                        .map(|(idx, (_, f))| {
652                            if idx == 0 || *mode == UnionMode::Sparse {
653                                Self::new_null(f.data_type(), len)
654                            } else {
655                                Self::new_empty(f.data_type())
656                            }
657                        })
658                        .collect();
659
660                    (buffers, children, false)
661                }
662                DataType::RunEndEncoded(r, v) => {
663                    let runs = match r.data_type() {
664                        DataType::Int16 => {
665                            let i = i16::from_usize(len).expect("run overflow");
666                            Buffer::from_slice_ref([i])
667                        }
668                        DataType::Int32 => {
669                            let i = i32::from_usize(len).expect("run overflow");
670                            Buffer::from_slice_ref([i])
671                        }
672                        DataType::Int64 => {
673                            let i = i64::from_usize(len).expect("run overflow");
674                            Buffer::from_slice_ref([i])
675                        }
676                        dt => unreachable!("Invalid run ends data type {dt}"),
677                    };
678
679                    let builder = ArrayData::builder(r.data_type().clone())
680                        .len(1)
681                        .buffers(vec![runs]);
682
683                    // SAFETY:
684                    // Valid by construction
685                    let runs = unsafe { builder.build_unchecked() };
686                    (
687                        vec![],
688                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
689                        false,
690                    )
691                }
692                d => unreachable!("{d}"),
693            },
694        };
695
696        let mut builder = ArrayDataBuilder::new(data_type.clone())
697            .len(len)
698            .buffers(buffers)
699            .child_data(child_data);
700
701        if has_nulls {
702            builder = builder.nulls(Some(NullBuffer::new_null(len)))
703        }
704
705        // SAFETY:
706        // Data valid by construction
707        unsafe { builder.build_unchecked() }
708    }
709
710    /// Returns a new empty [ArrayData] valid for `data_type`.
711    pub fn new_empty(data_type: &DataType) -> Self {
712        Self::new_null(data_type, 0)
713    }
714
715    /// Verifies that the buffers meet the minimum alignment requirements for the data type
716    ///
717    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
718    ///
719    /// This can be useful for when interacting with data sent over IPC or FFI, that may
720    /// not meet the minimum alignment requirements
721    ///
722    /// This also aligns buffers of children data
723    pub fn align_buffers(&mut self) {
724        let layout = layout(&self.data_type);
725        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
726            if let BufferSpec::FixedWidth { alignment, .. } = spec {
727                if buffer.as_ptr().align_offset(*alignment) != 0 {
728                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
729                }
730            }
731        }
732        // align children data recursively
733        for data in self.child_data.iter_mut() {
734            data.align_buffers()
735        }
736    }
737
738    /// "cheap" validation of an `ArrayData`. Ensures buffers are
739    /// sufficiently sized to store `len` + `offset` total elements of
740    /// `data_type` and performs other inexpensive consistency checks.
741    ///
742    /// This check is "cheap" in the sense that it does not validate the
743    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
744    /// are within the bounds of the values buffer).
745    ///
746    /// See [ArrayData::validate_data] to validate fully the offset content
747    /// and the validity of utf8 data
748    pub fn validate(&self) -> Result<(), ArrowError> {
749        // Need at least this mich space in each buffer
750        let len_plus_offset = self.len + self.offset;
751
752        // Check that the data layout conforms to the spec
753        let layout = layout(&self.data_type);
754
755        if !layout.can_contain_null_mask && self.nulls.is_some() {
756            return Err(ArrowError::InvalidArgumentError(format!(
757                "Arrays of type {:?} cannot contain a null bitmask",
758                self.data_type,
759            )));
760        }
761
762        // Check data buffers length for view types and other types
763        if self.buffers.len() < layout.buffers.len()
764            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
765        {
766            return Err(ArrowError::InvalidArgumentError(format!(
767                "Expected {} buffers in array of type {:?}, got {}",
768                layout.buffers.len(),
769                self.data_type,
770                self.buffers.len(),
771            )));
772        }
773
774        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
775            match spec {
776                BufferSpec::FixedWidth {
777                    byte_width,
778                    alignment,
779                } => {
780                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
781
782                    if buffer.len() < min_buffer_size {
783                        return Err(ArrowError::InvalidArgumentError(format!(
784                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
785                            min_buffer_size, i, self.data_type, buffer.len()
786                        )));
787                    }
788
789                    let align_offset = buffer.as_ptr().align_offset(*alignment);
790                    if align_offset != 0 {
791                        return Err(ArrowError::InvalidArgumentError(format!(
792                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
793                            self.data_type, align_offset.min(alignment - align_offset)
794                        )));
795                    }
796                }
797                BufferSpec::VariableWidth => {
798                    // not cheap to validate (need to look at the
799                    // data). Partially checked in validate_offsets
800                    // called below. Can check with `validate_full`
801                }
802                BufferSpec::BitMap => {
803                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
804                    if buffer.len() < min_buffer_size {
805                        return Err(ArrowError::InvalidArgumentError(format!(
806                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
807                            min_buffer_size, i, self.data_type, buffer.len()
808                        )));
809                    }
810                }
811                BufferSpec::AlwaysNull => {
812                    // Nothing to validate
813                }
814            }
815        }
816
817        // check null bit buffer size
818        if let Some(nulls) = self.nulls() {
819            if nulls.null_count() > self.len {
820                return Err(ArrowError::InvalidArgumentError(format!(
821                    "null_count {} for an array exceeds length of {} elements",
822                    nulls.null_count(),
823                    self.len
824                )));
825            }
826
827            let actual_len = nulls.validity().len();
828            let needed_len = bit_util::ceil(len_plus_offset, 8);
829            if actual_len < needed_len {
830                return Err(ArrowError::InvalidArgumentError(format!(
831                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
832                )));
833            }
834
835            if nulls.len() != self.len {
836                return Err(ArrowError::InvalidArgumentError(format!(
837                    "null buffer incorrect size. got {} expected {}",
838                    nulls.len(),
839                    self.len
840                )));
841            }
842        }
843
844        self.validate_child_data()?;
845
846        // Additional Type specific checks
847        match &self.data_type {
848            DataType::Utf8 | DataType::Binary => {
849                self.validate_offsets::<i32>(self.buffers[1].len())?;
850            }
851            DataType::LargeUtf8 | DataType::LargeBinary => {
852                self.validate_offsets::<i64>(self.buffers[1].len())?;
853            }
854            DataType::Dictionary(key_type, _value_type) => {
855                // At the moment, constructing a DictionaryArray will also check this
856                if !DataType::is_dictionary_key_type(key_type) {
857                    return Err(ArrowError::InvalidArgumentError(format!(
858                        "Dictionary key type must be integer, but was {key_type}"
859                    )));
860                }
861            }
862            DataType::RunEndEncoded(run_ends_type, _) => {
863                if run_ends_type.is_nullable() {
864                    return Err(ArrowError::InvalidArgumentError(
865                        "The nullable should be set to false for the field defining run_ends array.".to_string()
866                    ));
867                }
868                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
869                    return Err(ArrowError::InvalidArgumentError(format!(
870                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
871                        run_ends_type.data_type()
872                    )));
873                }
874            }
875            _ => {}
876        };
877
878        Ok(())
879    }
880
881    /// Returns a reference to the data in `buffer` as a typed slice
882    /// (typically `&[i32]` or `&[i64]`) after validating. The
883    /// returned slice is guaranteed to have at least `self.len + 1`
884    /// entries.
885    ///
886    /// For an empty array, the `buffer` can also be empty.
887    fn typed_offsets<T: ArrowNativeType + num::Num>(&self) -> Result<&[T], ArrowError> {
888        // An empty list-like array can have 0 offsets
889        if self.len == 0 && self.buffers[0].is_empty() {
890            return Ok(&[]);
891        }
892
893        self.typed_buffer(0, self.len + 1)
894    }
895
896    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
897    fn typed_buffer<T: ArrowNativeType + num::Num>(
898        &self,
899        idx: usize,
900        len: usize,
901    ) -> Result<&[T], ArrowError> {
902        let buffer = &self.buffers[idx];
903
904        let required_len = (len + self.offset) * mem::size_of::<T>();
905
906        if buffer.len() < required_len {
907            return Err(ArrowError::InvalidArgumentError(format!(
908                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
909                idx,
910                self.data_type,
911                required_len,
912                buffer.len()
913            )));
914        }
915
916        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
917    }
918
919    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
920    /// offsets (of type T) into some other buffer of `values_length` bytes long
921    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
922        &self,
923        values_length: usize,
924    ) -> Result<(), ArrowError> {
925        // Justification: buffer size was validated above
926        let offsets = self.typed_offsets::<T>()?;
927        if offsets.is_empty() {
928            return Ok(());
929        }
930
931        let first_offset = offsets[0].to_usize().ok_or_else(|| {
932            ArrowError::InvalidArgumentError(format!(
933                "Error converting offset[0] ({}) to usize for {}",
934                offsets[0], self.data_type
935            ))
936        })?;
937
938        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
939            ArrowError::InvalidArgumentError(format!(
940                "Error converting offset[{}] ({}) to usize for {}",
941                self.len, offsets[self.len], self.data_type
942            ))
943        })?;
944
945        if first_offset > values_length {
946            return Err(ArrowError::InvalidArgumentError(format!(
947                "First offset {} of {} is larger than values length {}",
948                first_offset, self.data_type, values_length,
949            )));
950        }
951
952        if last_offset > values_length {
953            return Err(ArrowError::InvalidArgumentError(format!(
954                "Last offset {} of {} is larger than values length {}",
955                last_offset, self.data_type, values_length,
956            )));
957        }
958
959        if first_offset > last_offset {
960            return Err(ArrowError::InvalidArgumentError(format!(
961                "First offset {} in {} is smaller than last offset {}",
962                first_offset, self.data_type, last_offset,
963            )));
964        }
965
966        Ok(())
967    }
968
969    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
970    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
971    fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
972        &self,
973        values_length: usize,
974    ) -> Result<(), ArrowError> {
975        let offsets: &[T] = self.typed_buffer(0, self.len)?;
976        let sizes: &[T] = self.typed_buffer(1, self.len)?;
977        for i in 0..values_length {
978            let size = sizes[i].to_usize().ok_or_else(|| {
979                ArrowError::InvalidArgumentError(format!(
980                    "Error converting size[{}] ({}) to usize for {}",
981                    i, sizes[i], self.data_type
982                ))
983            })?;
984            let offset = offsets[i].to_usize().ok_or_else(|| {
985                ArrowError::InvalidArgumentError(format!(
986                    "Error converting offset[{}] ({}) to usize for {}",
987                    i, offsets[i], self.data_type
988                ))
989            })?;
990            if size
991                .checked_add(offset)
992                .expect("Offset and size have exceeded the usize boundary")
993                > values_length
994            {
995                return Err(ArrowError::InvalidArgumentError(format!(
996                    "Size {} at index {} is larger than the remaining values for {}",
997                    size, i, self.data_type
998                )));
999            }
1000        }
1001        Ok(())
1002    }
1003
1004    /// Validates the layout of `child_data` ArrayData structures
1005    fn validate_child_data(&self) -> Result<(), ArrowError> {
1006        match &self.data_type {
1007            DataType::List(field) | DataType::Map(field, _) => {
1008                let values_data = self.get_single_valid_child_data(field.data_type())?;
1009                self.validate_offsets::<i32>(values_data.len)?;
1010                Ok(())
1011            }
1012            DataType::LargeList(field) => {
1013                let values_data = self.get_single_valid_child_data(field.data_type())?;
1014                self.validate_offsets::<i64>(values_data.len)?;
1015                Ok(())
1016            }
1017            DataType::ListView(field) => {
1018                let values_data = self.get_single_valid_child_data(field.data_type())?;
1019                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1020                Ok(())
1021            }
1022            DataType::LargeListView(field) => {
1023                let values_data = self.get_single_valid_child_data(field.data_type())?;
1024                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1025                Ok(())
1026            }
1027            DataType::FixedSizeList(field, list_size) => {
1028                let values_data = self.get_single_valid_child_data(field.data_type())?;
1029
1030                let list_size: usize = (*list_size).try_into().map_err(|_| {
1031                    ArrowError::InvalidArgumentError(format!(
1032                        "{} has a negative list_size {}",
1033                        self.data_type, list_size
1034                    ))
1035                })?;
1036
1037                let expected_values_len = self.len
1038                    .checked_mul(list_size)
1039                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1040
1041                if values_data.len < expected_values_len {
1042                    return Err(ArrowError::InvalidArgumentError(format!(
1043                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1044                        values_data.len, self.len, list_size, self.data_type
1045                    )));
1046                }
1047
1048                Ok(())
1049            }
1050            DataType::Struct(fields) => {
1051                self.validate_num_child_data(fields.len())?;
1052                for (i, field) in fields.iter().enumerate() {
1053                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1054
1055                    // Ensure child field has sufficient size
1056                    if field_data.len < self.len {
1057                        return Err(ArrowError::InvalidArgumentError(format!(
1058                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1059                            self.data_type, i, field.name(), field_data.len, self.len
1060                        )));
1061                    }
1062                }
1063                Ok(())
1064            }
1065            DataType::RunEndEncoded(run_ends_field, values_field) => {
1066                self.validate_num_child_data(2)?;
1067                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1068                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1069                if run_ends_data.len != values_data.len {
1070                    return Err(ArrowError::InvalidArgumentError(format!(
1071                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1072                        run_ends_data.len, values_data.len
1073                    )));
1074                }
1075                if run_ends_data.nulls.is_some() {
1076                    return Err(ArrowError::InvalidArgumentError(
1077                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1078                    ));
1079                }
1080                Ok(())
1081            }
1082            DataType::Union(fields, mode) => {
1083                self.validate_num_child_data(fields.len())?;
1084
1085                for (i, (_, field)) in fields.iter().enumerate() {
1086                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1087
1088                    if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1089                        return Err(ArrowError::InvalidArgumentError(format!(
1090                            "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1091                            i, field_data.len, self.len + self.offset
1092                        )));
1093                    }
1094                }
1095                Ok(())
1096            }
1097            DataType::Dictionary(_key_type, value_type) => {
1098                self.get_single_valid_child_data(value_type)?;
1099                Ok(())
1100            }
1101            _ => {
1102                // other types do not have child data
1103                if !self.child_data.is_empty() {
1104                    return Err(ArrowError::InvalidArgumentError(format!(
1105                        "Expected no child arrays for type {} but got {}",
1106                        self.data_type,
1107                        self.child_data.len()
1108                    )));
1109                }
1110                Ok(())
1111            }
1112        }
1113    }
1114
1115    /// Ensures that this array data has a single child_data with the
1116    /// expected type, and calls `validate()` on it. Returns a
1117    /// reference to that child_data
1118    fn get_single_valid_child_data(
1119        &self,
1120        expected_type: &DataType,
1121    ) -> Result<&ArrayData, ArrowError> {
1122        self.validate_num_child_data(1)?;
1123        self.get_valid_child_data(0, expected_type)
1124    }
1125
1126    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1127    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1128        if self.child_data.len() != expected_len {
1129            Err(ArrowError::InvalidArgumentError(format!(
1130                "Value data for {} should contain {} child data array(s), had {}",
1131                self.data_type,
1132                expected_len,
1133                self.child_data.len()
1134            )))
1135        } else {
1136            Ok(())
1137        }
1138    }
1139
1140    /// Ensures that `child_data[i]` has the expected type, calls
1141    /// `validate()` on it, and returns a reference to that child_data
1142    fn get_valid_child_data(
1143        &self,
1144        i: usize,
1145        expected_type: &DataType,
1146    ) -> Result<&ArrayData, ArrowError> {
1147        let values_data = self.child_data.get(i).ok_or_else(|| {
1148            ArrowError::InvalidArgumentError(format!(
1149                "{} did not have enough child arrays. Expected at least {} but had only {}",
1150                self.data_type,
1151                i + 1,
1152                self.child_data.len()
1153            ))
1154        })?;
1155
1156        if expected_type != &values_data.data_type {
1157            return Err(ArrowError::InvalidArgumentError(format!(
1158                "Child type mismatch for {}. Expected {} but child data had {}",
1159                self.data_type, expected_type, values_data.data_type
1160            )));
1161        }
1162
1163        values_data.validate()?;
1164        Ok(values_data)
1165    }
1166
1167    /// Validate that the data contained within this [`ArrayData`] is valid
1168    ///
1169    /// 1. Null count is correct
1170    /// 2. All offsets are valid
1171    /// 3. All String data is valid UTF-8
1172    /// 4. All dictionary offsets are valid
1173    ///
1174    /// Internally this calls:
1175    ///
1176    /// * [`Self::validate`]
1177    /// * [`Self::validate_nulls`]
1178    /// * [`Self::validate_values`]
1179    ///
1180    /// Note: this does not recurse into children, for a recursive variant
1181    /// see [`Self::validate_full`]
1182    pub fn validate_data(&self) -> Result<(), ArrowError> {
1183        self.validate()?;
1184
1185        self.validate_nulls()?;
1186        self.validate_values()?;
1187        Ok(())
1188    }
1189
1190    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1191    ///
1192    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1193    /// and all its children recursively
1194    pub fn validate_full(&self) -> Result<(), ArrowError> {
1195        self.validate_data()?;
1196        // validate all children recursively
1197        self.child_data
1198            .iter()
1199            .enumerate()
1200            .try_for_each(|(i, child_data)| {
1201                child_data.validate_full().map_err(|e| {
1202                    ArrowError::InvalidArgumentError(format!(
1203                        "{} child #{} invalid: {}",
1204                        self.data_type, i, e
1205                    ))
1206                })
1207            })?;
1208        Ok(())
1209    }
1210
1211    /// Validates the values stored within this [`ArrayData`] are valid
1212    /// without recursing into child [`ArrayData`]
1213    ///
1214    /// Does not (yet) check
1215    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1216    /// 2. the the null count is correct and that any
1217    /// 3. nullability requirements of its children are correct
1218    ///
1219    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1220    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1221        if let Some(nulls) = &self.nulls {
1222            let actual = nulls.len() - nulls.inner().count_set_bits();
1223            if actual != nulls.null_count() {
1224                return Err(ArrowError::InvalidArgumentError(format!(
1225                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1226                    nulls.null_count(),
1227                    actual
1228                )));
1229            }
1230        }
1231
1232        // In general non-nullable children should not contain nulls, however, for certain
1233        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1234        // space in the child. As such we permit nulls in the children in the corresponding
1235        // positions for such types
1236        match &self.data_type {
1237            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1238                if !f.is_nullable() {
1239                    self.validate_non_nullable(None, &self.child_data[0])?
1240                }
1241            }
1242            DataType::FixedSizeList(field, len) => {
1243                let child = &self.child_data[0];
1244                if !field.is_nullable() {
1245                    match &self.nulls {
1246                        Some(nulls) => {
1247                            let element_len = *len as usize;
1248                            let expanded = nulls.expand(element_len);
1249                            self.validate_non_nullable(Some(&expanded), child)?;
1250                        }
1251                        None => self.validate_non_nullable(None, child)?,
1252                    }
1253                }
1254            }
1255            DataType::Struct(fields) => {
1256                for (field, child) in fields.iter().zip(&self.child_data) {
1257                    if !field.is_nullable() {
1258                        self.validate_non_nullable(self.nulls(), child)?
1259                    }
1260                }
1261            }
1262            _ => {}
1263        }
1264
1265        Ok(())
1266    }
1267
1268    /// Verifies that `child` contains no nulls not present in `mask`
1269    fn validate_non_nullable(
1270        &self,
1271        mask: Option<&NullBuffer>,
1272        child: &ArrayData,
1273    ) -> Result<(), ArrowError> {
1274        let mask = match mask {
1275            Some(mask) => mask,
1276            None => {
1277                return match child.null_count() {
1278                    0 => Ok(()),
1279                    _ => Err(ArrowError::InvalidArgumentError(format!(
1280                        "non-nullable child of type {} contains nulls not present in parent {}",
1281                        child.data_type, self.data_type
1282                    ))),
1283                }
1284            }
1285        };
1286
1287        match child.nulls() {
1288            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1289                "non-nullable child of type {} contains nulls not present in parent",
1290                child.data_type
1291            ))),
1292            _ => Ok(()),
1293        }
1294    }
1295
1296    /// Validates the values stored within this [`ArrayData`] are valid
1297    /// without recursing into child [`ArrayData`]
1298    ///
1299    /// Does not (yet) check
1300    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1301    pub fn validate_values(&self) -> Result<(), ArrowError> {
1302        match &self.data_type {
1303            DataType::Utf8 => self.validate_utf8::<i32>(),
1304            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1305            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1306            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1307            DataType::BinaryView => {
1308                let views = self.typed_buffer::<u128>(0, self.len)?;
1309                validate_binary_view(views, &self.buffers[1..])
1310            }
1311            DataType::Utf8View => {
1312                let views = self.typed_buffer::<u128>(0, self.len)?;
1313                validate_string_view(views, &self.buffers[1..])
1314            }
1315            DataType::List(_) | DataType::Map(_, _) => {
1316                let child = &self.child_data[0];
1317                self.validate_offsets_full::<i32>(child.len)
1318            }
1319            DataType::LargeList(_) => {
1320                let child = &self.child_data[0];
1321                self.validate_offsets_full::<i64>(child.len)
1322            }
1323            DataType::Union(_, _) => {
1324                // Validate Union Array as part of implementing new Union semantics
1325                // See comments in `ArrayData::validate()`
1326                // https://github.com/apache/arrow-rs/issues/85
1327                //
1328                // TODO file follow on ticket for full union validation
1329                Ok(())
1330            }
1331            DataType::Dictionary(key_type, _value_type) => {
1332                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1333                let max_value = dictionary_length - 1;
1334                match key_type.as_ref() {
1335                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1336                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1337                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1338                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1339                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1340                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1341                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1342                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1343                    _ => unreachable!(),
1344                }
1345            }
1346            DataType::RunEndEncoded(run_ends, _values) => {
1347                let run_ends_data = self.child_data()[0].clone();
1348                match run_ends.data_type() {
1349                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1350                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1351                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1352                    _ => unreachable!(),
1353                }
1354            }
1355            _ => {
1356                // No extra validation check required for other types
1357                Ok(())
1358            }
1359        }
1360    }
1361
1362    /// Calls the `validate(item_index, range)` function for each of
1363    /// the ranges specified in the arrow offsets buffer of type
1364    /// `T`. Also validates that each offset is smaller than
1365    /// `offset_limit`
1366    ///
1367    /// For an empty array, the offsets buffer can either be empty
1368    /// or contain a single `0`.
1369    ///
1370    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1371    /// function would call `validate([1,2])`, and `validate([2,4])`
1372    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1373    where
1374        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1375        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1376    {
1377        self.typed_offsets::<T>()?
1378            .iter()
1379            .enumerate()
1380            .map(|(i, x)| {
1381                // check if the offset can be converted to usize
1382                let r = x.to_usize().ok_or_else(|| {
1383                    ArrowError::InvalidArgumentError(format!(
1384                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1385                    );
1386                // check if the offset exceeds the limit
1387                match r {
1388                    Ok(n) if n <= offset_limit => Ok((i, n)),
1389                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1390                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1391                    ),
1392                    Err(e) => Err(e),
1393                }
1394            })
1395            .scan(0_usize, |start, end| {
1396                // check offsets are monotonically increasing
1397                match end {
1398                    Ok((i, end)) if *start <= end => {
1399                        let range = Some(Ok((i, *start..end)));
1400                        *start = end;
1401                        range
1402                    }
1403                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1404                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1405                        i - 1, start, end))
1406                    )),
1407                    Err(err) => Some(Err(err)),
1408                }
1409            })
1410            .skip(1) // the first element is meaningless
1411            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1412                let (item_index, range) = res?;
1413                validate(item_index-1, range)
1414            })
1415    }
1416
1417    /// Ensures that all strings formed by the offsets in `buffers[0]`
1418    /// into `buffers[1]` are valid utf8 sequences
1419    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1420    where
1421        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1422    {
1423        let values_buffer = &self.buffers[1].as_slice();
1424        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1425            // Validate Offsets are correct
1426            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1427                if !values_str.is_char_boundary(range.start)
1428                    || !values_str.is_char_boundary(range.end)
1429                {
1430                    return Err(ArrowError::InvalidArgumentError(format!(
1431                        "incomplete utf-8 byte sequence from index {string_index}"
1432                    )));
1433                }
1434                Ok(())
1435            })
1436        } else {
1437            // find specific offset that failed utf8 validation
1438            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1439                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1440                    ArrowError::InvalidArgumentError(format!(
1441                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1442                    ))
1443                })?;
1444                Ok(())
1445            })
1446        }
1447    }
1448
1449    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1450    /// between `0` and `offset_limit`
1451    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1452    where
1453        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1454    {
1455        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1456            // No validation applied to each value, but the iteration
1457            // itself applies bounds checking to each range
1458            Ok(())
1459        })
1460    }
1461
1462    /// Validates that each value in self.buffers (typed as T)
1463    /// is within the range [0, max_value], inclusive
1464    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1465    where
1466        T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1467    {
1468        let required_len = self.len + self.offset;
1469        let buffer = &self.buffers[0];
1470
1471        // This should have been checked as part of `validate()` prior
1472        // to calling `validate_full()` but double check to be sure
1473        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1474
1475        // Justification: buffer size was validated above
1476        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1477
1478        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1479            // Do not check the value is null (value can be arbitrary)
1480            if self.is_null(i) {
1481                return Ok(());
1482            }
1483            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1484                ArrowError::InvalidArgumentError(format!(
1485                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1486                ))
1487            })?;
1488
1489            if dict_index < 0 || dict_index > max_value {
1490                return Err(ArrowError::InvalidArgumentError(format!(
1491                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1492                )));
1493            }
1494            Ok(())
1495        })
1496    }
1497
1498    /// Validates that each value in run_ends array is positive and strictly increasing.
1499    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1500    where
1501        T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1502    {
1503        let values = self.typed_buffer::<T>(0, self.len)?;
1504        let mut prev_value: i64 = 0_i64;
1505        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1506            let value: i64 = inp_value.try_into().map_err(|_| {
1507                ArrowError::InvalidArgumentError(format!(
1508                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1509                ))
1510            })?;
1511            if value <= 0_i64 {
1512                return Err(ArrowError::InvalidArgumentError(format!(
1513                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1514                )));
1515            }
1516            if ix > 0 && value <= prev_value {
1517                return Err(ArrowError::InvalidArgumentError(format!(
1518                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1519                )));
1520            }
1521
1522            prev_value = value;
1523            Ok(())
1524        })?;
1525
1526        if prev_value.as_usize() < (self.offset + self.len) {
1527            return Err(ArrowError::InvalidArgumentError(format!(
1528                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1529                self.offset + self.len
1530            )));
1531        }
1532        Ok(())
1533    }
1534
1535    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1536    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1537    /// return false when the arrays are logically equal
1538    pub fn ptr_eq(&self, other: &Self) -> bool {
1539        if self.offset != other.offset
1540            || self.len != other.len
1541            || self.data_type != other.data_type
1542            || self.buffers.len() != other.buffers.len()
1543            || self.child_data.len() != other.child_data.len()
1544        {
1545            return false;
1546        }
1547
1548        match (&self.nulls, &other.nulls) {
1549            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1550            (Some(_), None) | (None, Some(_)) => return false,
1551            _ => {}
1552        };
1553
1554        if !self
1555            .buffers
1556            .iter()
1557            .zip(other.buffers.iter())
1558            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1559        {
1560            return false;
1561        }
1562
1563        self.child_data
1564            .iter()
1565            .zip(other.child_data.iter())
1566            .all(|(a, b)| a.ptr_eq(b))
1567    }
1568
1569    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1570    pub fn into_builder(self) -> ArrayDataBuilder {
1571        self.into()
1572    }
1573}
1574
1575/// Return the expected [`DataTypeLayout`] Arrays of this data
1576/// type are expected to have
1577pub fn layout(data_type: &DataType) -> DataTypeLayout {
1578    // based on C/C++ implementation in
1579    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1580    use arrow_schema::IntervalUnit::*;
1581
1582    match data_type {
1583        DataType::Null => DataTypeLayout {
1584            buffers: vec![],
1585            can_contain_null_mask: false,
1586            variadic: false,
1587        },
1588        DataType::Boolean => DataTypeLayout {
1589            buffers: vec![BufferSpec::BitMap],
1590            can_contain_null_mask: true,
1591            variadic: false,
1592        },
1593        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1594        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1595        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1596        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1597        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1598        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1599        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1600        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1601        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1602        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1603        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1604        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1605        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1606        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1607        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1608        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1609        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1610        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1611        DataType::Interval(MonthDayNano) => {
1612            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1613        }
1614        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1615        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1616        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1617        DataType::FixedSizeBinary(size) => {
1618            let spec = BufferSpec::FixedWidth {
1619                byte_width: (*size).try_into().unwrap(),
1620                alignment: mem::align_of::<u8>(),
1621            };
1622            DataTypeLayout {
1623                buffers: vec![spec],
1624                can_contain_null_mask: true,
1625                variadic: false,
1626            }
1627        }
1628        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1629        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1630        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1631        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1632        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1633        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1634        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1635        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1636        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1637        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1638        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1639        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1640        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1641        DataType::Union(_, mode) => {
1642            let type_ids = BufferSpec::FixedWidth {
1643                byte_width: mem::size_of::<i8>(),
1644                alignment: mem::align_of::<i8>(),
1645            };
1646
1647            DataTypeLayout {
1648                buffers: match mode {
1649                    UnionMode::Sparse => {
1650                        vec![type_ids]
1651                    }
1652                    UnionMode::Dense => {
1653                        vec![
1654                            type_ids,
1655                            BufferSpec::FixedWidth {
1656                                byte_width: mem::size_of::<i32>(),
1657                                alignment: mem::align_of::<i32>(),
1658                            },
1659                        ]
1660                    }
1661                },
1662                can_contain_null_mask: false,
1663                variadic: false,
1664            }
1665        }
1666        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1667    }
1668}
1669
1670/// Layout specification for a data type
1671#[derive(Debug, PartialEq, Eq)]
1672// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
1673pub struct DataTypeLayout {
1674    /// A vector of buffer layout specifications, one for each expected buffer
1675    pub buffers: Vec<BufferSpec>,
1676
1677    /// Can contain a null bitmask
1678    pub can_contain_null_mask: bool,
1679
1680    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
1681    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
1682    /// buffers.len(). Buffers that exceed the lower bound are legal.
1683    pub variadic: bool,
1684}
1685
1686impl DataTypeLayout {
1687    /// Describes a basic numeric array where each element has type `T`
1688    pub fn new_fixed_width<T>() -> Self {
1689        Self {
1690            buffers: vec![BufferSpec::FixedWidth {
1691                byte_width: mem::size_of::<T>(),
1692                alignment: mem::align_of::<T>(),
1693            }],
1694            can_contain_null_mask: true,
1695            variadic: false,
1696        }
1697    }
1698
1699    /// Describes arrays which have no data of their own
1700    /// but may still have a Null Bitmap (e.g. FixedSizeList)
1701    pub fn new_nullable_empty() -> Self {
1702        Self {
1703            buffers: vec![],
1704            can_contain_null_mask: true,
1705            variadic: false,
1706        }
1707    }
1708
1709    /// Describes arrays which have no data of their own
1710    /// (e.g. RunEndEncoded).
1711    pub fn new_empty() -> Self {
1712        Self {
1713            buffers: vec![],
1714            can_contain_null_mask: false,
1715            variadic: false,
1716        }
1717    }
1718
1719    /// Describes a basic numeric array where each element has a fixed
1720    /// with offset buffer of type `T`, followed by a
1721    /// variable width data buffer
1722    pub fn new_binary<T>() -> Self {
1723        Self {
1724            buffers: vec![
1725                // offsets
1726                BufferSpec::FixedWidth {
1727                    byte_width: mem::size_of::<T>(),
1728                    alignment: mem::align_of::<T>(),
1729                },
1730                // values
1731                BufferSpec::VariableWidth,
1732            ],
1733            can_contain_null_mask: true,
1734            variadic: false,
1735        }
1736    }
1737
1738    /// Describes a view type
1739    pub fn new_view() -> Self {
1740        Self {
1741            buffers: vec![BufferSpec::FixedWidth {
1742                byte_width: mem::size_of::<u128>(),
1743                alignment: mem::align_of::<u128>(),
1744            }],
1745            can_contain_null_mask: true,
1746            variadic: true,
1747        }
1748    }
1749
1750    /// Describes a list view type
1751    pub fn new_list_view<T>() -> Self {
1752        Self {
1753            buffers: vec![
1754                BufferSpec::FixedWidth {
1755                    byte_width: mem::size_of::<T>(),
1756                    alignment: mem::align_of::<T>(),
1757                },
1758                BufferSpec::FixedWidth {
1759                    byte_width: mem::size_of::<T>(),
1760                    alignment: mem::align_of::<T>(),
1761                },
1762            ],
1763            can_contain_null_mask: true,
1764            variadic: true,
1765        }
1766    }
1767}
1768
1769/// Layout specification for a single data type buffer
1770#[derive(Debug, PartialEq, Eq)]
1771pub enum BufferSpec {
1772    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
1773    ///
1774    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
1775    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
1776    ///
1777    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
1778    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
1779    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
1780    ///
1781    /// Note that these alignment requirements will vary between architectures
1782    FixedWidth {
1783        /// The width of each element in bytes
1784        byte_width: usize,
1785        /// The alignment required by Rust for an array of the corresponding primitive
1786        alignment: usize,
1787    },
1788    /// Variable width, such as string data for utf8 data
1789    VariableWidth,
1790    /// Buffer holds a bitmap.
1791    ///
1792    /// Note: Unlike the C++ implementation, the null/validity buffer
1793    /// is handled specially rather than as another of the buffers in
1794    /// the spec, so this variant is only used for the Boolean type.
1795    BitMap,
1796    /// Buffer is always null. Unused currently in Rust implementation,
1797    /// (used in C++ for Union type)
1798    #[allow(dead_code)]
1799    AlwaysNull,
1800}
1801
1802impl PartialEq for ArrayData {
1803    fn eq(&self, other: &Self) -> bool {
1804        equal::equal(self, other)
1805    }
1806}
1807
1808/// A boolean flag that cannot be mutated outside of unsafe code.
1809///
1810/// Defaults to a value of false.
1811///
1812/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
1813///
1814/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
1815///
1816/// # Example
1817/// ```rust
1818/// use arrow_data::UnsafeFlag;
1819/// assert!(!UnsafeFlag::default().get()); // default is false
1820/// let mut flag = UnsafeFlag::new();
1821/// assert!(!flag.get()); // defaults to false
1822/// // can only set it to true in unsafe code
1823/// unsafe { flag.set(true) };
1824/// assert!(flag.get()); // now true
1825/// ```
1826#[derive(Debug, Clone)]
1827#[doc(hidden)]
1828pub struct UnsafeFlag(bool);
1829
1830impl UnsafeFlag {
1831    /// Creates a new `UnsafeFlag` with the value set to `false`.
1832    ///
1833    /// See examples on [`Self::new`]
1834    #[inline]
1835    pub const fn new() -> Self {
1836        Self(false)
1837    }
1838
1839    /// Sets the value of the flag to the given value
1840    ///
1841    /// Note this can purposely only be done in `unsafe` code
1842    ///
1843    /// # Safety
1844    ///
1845    /// If set, the flag will be set to the given value. There is nothing
1846    /// immediately unsafe about doing so, however, the flag can be used to
1847    /// subsequently bypass safety checks in the [`ArrayDataBuilder`].
1848    #[inline]
1849    pub unsafe fn set(&mut self, val: bool) {
1850        self.0 = val;
1851    }
1852
1853    /// Returns the value of the flag
1854    #[inline]
1855    pub fn get(&self) -> bool {
1856        self.0
1857    }
1858}
1859
1860// Manual impl to make it clear you can not construct unsafe with true
1861impl Default for UnsafeFlag {
1862    fn default() -> Self {
1863        Self::new()
1864    }
1865}
1866
1867/// Builder for [`ArrayData`] type
1868#[derive(Debug)]
1869pub struct ArrayDataBuilder {
1870    data_type: DataType,
1871    len: usize,
1872    null_count: Option<usize>,
1873    null_bit_buffer: Option<Buffer>,
1874    nulls: Option<NullBuffer>,
1875    offset: usize,
1876    buffers: Vec<Buffer>,
1877    child_data: Vec<ArrayData>,
1878    /// Should buffers be realigned (copying if necessary)?
1879    ///
1880    /// Defaults to false.
1881    align_buffers: bool,
1882    /// Should data validation be skipped for this [`ArrayData`]?
1883    ///
1884    /// Defaults to false.
1885    ///
1886    /// # Safety
1887    ///
1888    /// This flag can only be set to true using `unsafe` APIs. However, once true
1889    /// subsequent calls to `build()` may result in undefined behavior if the data
1890    /// is not valid.
1891    skip_validation: UnsafeFlag,
1892}
1893
1894impl ArrayDataBuilder {
1895    #[inline]
1896    /// Creates a new array data builder
1897    pub const fn new(data_type: DataType) -> Self {
1898        Self {
1899            data_type,
1900            len: 0,
1901            null_count: None,
1902            null_bit_buffer: None,
1903            nulls: None,
1904            offset: 0,
1905            buffers: vec![],
1906            child_data: vec![],
1907            align_buffers: false,
1908            skip_validation: UnsafeFlag::new(),
1909        }
1910    }
1911
1912    /// Creates a new array data builder from an existing one, changing the data type
1913    pub fn data_type(self, data_type: DataType) -> Self {
1914        Self { data_type, ..self }
1915    }
1916
1917    #[inline]
1918    #[allow(clippy::len_without_is_empty)]
1919    /// Sets the length of the [ArrayData]
1920    pub const fn len(mut self, n: usize) -> Self {
1921        self.len = n;
1922        self
1923    }
1924
1925    /// Sets the null buffer of the [ArrayData]
1926    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1927        self.nulls = nulls;
1928        self.null_count = None;
1929        self.null_bit_buffer = None;
1930        self
1931    }
1932
1933    /// Sets the null count of the [ArrayData]
1934    pub fn null_count(mut self, null_count: usize) -> Self {
1935        self.null_count = Some(null_count);
1936        self
1937    }
1938
1939    /// Sets the `null_bit_buffer` of the [ArrayData]
1940    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1941        self.nulls = None;
1942        self.null_bit_buffer = buf;
1943        self
1944    }
1945
1946    /// Sets the offset of the [ArrayData]
1947    #[inline]
1948    pub const fn offset(mut self, n: usize) -> Self {
1949        self.offset = n;
1950        self
1951    }
1952
1953    /// Sets the buffers of the [ArrayData]
1954    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1955        self.buffers = v;
1956        self
1957    }
1958
1959    /// Adds a single buffer to the [ArrayData]'s buffers
1960    pub fn add_buffer(mut self, b: Buffer) -> Self {
1961        self.buffers.push(b);
1962        self
1963    }
1964
1965    /// Adds multiple buffers to the [ArrayData]'s buffers
1966    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1967        self.buffers.extend(bs);
1968        self
1969    }
1970
1971    /// Sets the child data of the [ArrayData]
1972    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1973        self.child_data = v;
1974        self
1975    }
1976
1977    /// Adds a single child data to the [ArrayData]'s child data
1978    pub fn add_child_data(mut self, r: ArrayData) -> Self {
1979        self.child_data.push(r);
1980        self
1981    }
1982
1983    /// Creates an array data, without any validation
1984    ///
1985    /// Note: This is shorthand for
1986    /// ```rust
1987    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
1988    /// # let _ = unsafe {
1989    /// builder.skip_validation(true).build().unwrap()
1990    /// # };
1991    /// ```
1992    ///
1993    /// # Safety
1994    ///
1995    /// The same caveats as [`ArrayData::new_unchecked`]
1996    /// apply.
1997    pub unsafe fn build_unchecked(self) -> ArrayData {
1998        self.skip_validation(true).build().unwrap()
1999    }
2000
2001    /// Creates an `ArrayData`, consuming `self`
2002    ///
2003    /// # Safety
2004    ///
2005    /// By default the underlying buffers are checked to ensure they are valid
2006    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
2007    /// to true (by the `unsafe` API) this validation is skipped. If the data is
2008    /// not valid, undefined behavior will result.
2009    pub fn build(self) -> Result<ArrayData, ArrowError> {
2010        let Self {
2011            data_type,
2012            len,
2013            null_count,
2014            null_bit_buffer,
2015            nulls,
2016            offset,
2017            buffers,
2018            child_data,
2019            align_buffers,
2020            skip_validation,
2021        } = self;
2022
2023        let nulls = nulls
2024            .or_else(|| {
2025                let buffer = null_bit_buffer?;
2026                let buffer = BooleanBuffer::new(buffer, offset, len);
2027                Some(match null_count {
2028                    Some(n) => {
2029                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2030                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2031                    }
2032                    None => NullBuffer::new(buffer),
2033                })
2034            })
2035            .filter(|b| b.null_count() != 0);
2036
2037        let mut data = ArrayData {
2038            data_type,
2039            len,
2040            offset,
2041            buffers,
2042            child_data,
2043            nulls,
2044        };
2045
2046        if align_buffers {
2047            data.align_buffers();
2048        }
2049
2050        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2051        if !skip_validation.get() || cfg!(feature = "force_validate") {
2052            data.validate_data()?;
2053        }
2054        Ok(data)
2055    }
2056
2057    /// Creates an array data, validating all inputs, and aligning any buffers
2058    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2059    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2060        self.align_buffers(true).build()
2061    }
2062
2063    /// Ensure that all buffers are aligned, copying data if necessary
2064    ///
2065    /// Rust requires that arrays are aligned to their corresponding primitive,
2066    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2067    ///
2068    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2069    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2070    ///
2071    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2072    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2073    /// when necessary, making it useful when interacting with buffers produced by other systems,
2074    /// e.g. IPC or FFI.
2075    ///
2076    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2077    /// insufficiently aligned buffers.
2078    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2079        self.align_buffers = align_buffers;
2080        self
2081    }
2082
2083    /// Skips validation of the data.
2084    ///
2085    /// If this flag is enabled, `[Self::build`] will skip validation of the
2086    /// data
2087    ///
2088    /// If this flag is not enabled, `[Self::build`] will validate that all
2089    /// buffers are valid and will return an error if any data is invalid.
2090    /// Validation can be expensive.
2091    ///
2092    /// # Safety
2093    ///
2094    /// If validation is skipped, the buffers must form a valid Arrow array,
2095    /// otherwise undefined behavior will result
2096    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2097        self.skip_validation.set(skip_validation);
2098        self
2099    }
2100}
2101
2102impl From<ArrayData> for ArrayDataBuilder {
2103    fn from(d: ArrayData) -> Self {
2104        Self {
2105            data_type: d.data_type,
2106            len: d.len,
2107            offset: d.offset,
2108            buffers: d.buffers,
2109            child_data: d.child_data,
2110            nulls: d.nulls,
2111            null_bit_buffer: None,
2112            null_count: None,
2113            align_buffers: false,
2114            skip_validation: UnsafeFlag::new(),
2115        }
2116    }
2117}
2118
2119#[cfg(test)]
2120mod tests {
2121    use super::*;
2122    use arrow_schema::{Field, Fields};
2123
2124    // See arrow/tests/array_data_validation.rs for test of array validation
2125
2126    /// returns a buffer initialized with some constant value for tests
2127    fn make_i32_buffer(n: usize) -> Buffer {
2128        Buffer::from_slice_ref(vec![42i32; n])
2129    }
2130
2131    /// returns a buffer initialized with some constant value for tests
2132    fn make_f32_buffer(n: usize) -> Buffer {
2133        Buffer::from_slice_ref(vec![42f32; n])
2134    }
2135
2136    #[test]
2137    fn test_builder() {
2138        // Buffer needs to be at least 25 long
2139        let v = (0..25).collect::<Vec<i32>>();
2140        let b1 = Buffer::from_slice_ref(&v);
2141        let arr_data = ArrayData::builder(DataType::Int32)
2142            .len(20)
2143            .offset(5)
2144            .add_buffer(b1)
2145            .null_bit_buffer(Some(Buffer::from([
2146                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2147            ])))
2148            .build()
2149            .unwrap();
2150
2151        assert_eq!(20, arr_data.len());
2152        assert_eq!(10, arr_data.null_count());
2153        assert_eq!(5, arr_data.offset());
2154        assert_eq!(1, arr_data.buffers().len());
2155        assert_eq!(
2156            Buffer::from_slice_ref(&v).as_slice(),
2157            arr_data.buffers()[0].as_slice()
2158        );
2159    }
2160
2161    #[test]
2162    fn test_builder_with_child_data() {
2163        let child_arr_data = ArrayData::try_new(
2164            DataType::Int32,
2165            5,
2166            None,
2167            0,
2168            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2169            vec![],
2170        )
2171        .unwrap();
2172
2173        let field = Arc::new(Field::new("x", DataType::Int32, true));
2174        let data_type = DataType::Struct(vec![field].into());
2175
2176        let arr_data = ArrayData::builder(data_type)
2177            .len(5)
2178            .offset(0)
2179            .add_child_data(child_arr_data.clone())
2180            .build()
2181            .unwrap();
2182
2183        assert_eq!(5, arr_data.len());
2184        assert_eq!(1, arr_data.child_data().len());
2185        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2186    }
2187
2188    #[test]
2189    fn test_null_count() {
2190        let mut bit_v: [u8; 2] = [0; 2];
2191        bit_util::set_bit(&mut bit_v, 0);
2192        bit_util::set_bit(&mut bit_v, 3);
2193        bit_util::set_bit(&mut bit_v, 10);
2194        let arr_data = ArrayData::builder(DataType::Int32)
2195            .len(16)
2196            .add_buffer(make_i32_buffer(16))
2197            .null_bit_buffer(Some(Buffer::from(bit_v)))
2198            .build()
2199            .unwrap();
2200        assert_eq!(13, arr_data.null_count());
2201
2202        // Test with offset
2203        let mut bit_v: [u8; 2] = [0; 2];
2204        bit_util::set_bit(&mut bit_v, 0);
2205        bit_util::set_bit(&mut bit_v, 3);
2206        bit_util::set_bit(&mut bit_v, 10);
2207        let arr_data = ArrayData::builder(DataType::Int32)
2208            .len(12)
2209            .offset(2)
2210            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2211            .null_bit_buffer(Some(Buffer::from(bit_v)))
2212            .build()
2213            .unwrap();
2214        assert_eq!(10, arr_data.null_count());
2215    }
2216
2217    #[test]
2218    fn test_null_buffer_ref() {
2219        let mut bit_v: [u8; 2] = [0; 2];
2220        bit_util::set_bit(&mut bit_v, 0);
2221        bit_util::set_bit(&mut bit_v, 3);
2222        bit_util::set_bit(&mut bit_v, 10);
2223        let arr_data = ArrayData::builder(DataType::Int32)
2224            .len(16)
2225            .add_buffer(make_i32_buffer(16))
2226            .null_bit_buffer(Some(Buffer::from(bit_v)))
2227            .build()
2228            .unwrap();
2229        assert!(arr_data.nulls().is_some());
2230        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2231    }
2232
2233    #[test]
2234    fn test_slice() {
2235        let mut bit_v: [u8; 2] = [0; 2];
2236        bit_util::set_bit(&mut bit_v, 0);
2237        bit_util::set_bit(&mut bit_v, 3);
2238        bit_util::set_bit(&mut bit_v, 10);
2239        let data = ArrayData::builder(DataType::Int32)
2240            .len(16)
2241            .add_buffer(make_i32_buffer(16))
2242            .null_bit_buffer(Some(Buffer::from(bit_v)))
2243            .build()
2244            .unwrap();
2245        let new_data = data.slice(1, 15);
2246        assert_eq!(data.len() - 1, new_data.len());
2247        assert_eq!(1, new_data.offset());
2248        assert_eq!(data.null_count(), new_data.null_count());
2249
2250        // slice of a slice (removes one null)
2251        let new_data = new_data.slice(1, 14);
2252        assert_eq!(data.len() - 2, new_data.len());
2253        assert_eq!(2, new_data.offset());
2254        assert_eq!(data.null_count() - 1, new_data.null_count());
2255    }
2256
2257    #[test]
2258    fn test_equality() {
2259        let int_data = ArrayData::builder(DataType::Int32)
2260            .len(1)
2261            .add_buffer(make_i32_buffer(1))
2262            .build()
2263            .unwrap();
2264
2265        let float_data = ArrayData::builder(DataType::Float32)
2266            .len(1)
2267            .add_buffer(make_f32_buffer(1))
2268            .build()
2269            .unwrap();
2270        assert_ne!(int_data, float_data);
2271        assert!(!int_data.ptr_eq(&float_data));
2272        assert!(int_data.ptr_eq(&int_data));
2273
2274        #[allow(clippy::redundant_clone)]
2275        let int_data_clone = int_data.clone();
2276        assert_eq!(int_data, int_data_clone);
2277        assert!(int_data.ptr_eq(&int_data_clone));
2278        assert!(int_data_clone.ptr_eq(&int_data));
2279
2280        let int_data_slice = int_data_clone.slice(1, 0);
2281        assert!(int_data_slice.ptr_eq(&int_data_slice));
2282        assert!(!int_data.ptr_eq(&int_data_slice));
2283        assert!(!int_data_slice.ptr_eq(&int_data));
2284
2285        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2286        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2287        let string_data = ArrayData::try_new(
2288            DataType::Utf8,
2289            3,
2290            Some(Buffer::from_iter(vec![true, false, true])),
2291            0,
2292            vec![offsets_buffer, data_buffer],
2293            vec![],
2294        )
2295        .unwrap();
2296
2297        assert_ne!(float_data, string_data);
2298        assert!(!float_data.ptr_eq(&string_data));
2299
2300        assert!(string_data.ptr_eq(&string_data));
2301
2302        #[allow(clippy::redundant_clone)]
2303        let string_data_cloned = string_data.clone();
2304        assert!(string_data_cloned.ptr_eq(&string_data));
2305        assert!(string_data.ptr_eq(&string_data_cloned));
2306
2307        let string_data_slice = string_data.slice(1, 2);
2308        assert!(string_data_slice.ptr_eq(&string_data_slice));
2309        assert!(!string_data_slice.ptr_eq(&string_data))
2310    }
2311
2312    #[test]
2313    fn test_slice_memory_size() {
2314        let mut bit_v: [u8; 2] = [0; 2];
2315        bit_util::set_bit(&mut bit_v, 0);
2316        bit_util::set_bit(&mut bit_v, 3);
2317        bit_util::set_bit(&mut bit_v, 10);
2318        let data = ArrayData::builder(DataType::Int32)
2319            .len(16)
2320            .add_buffer(make_i32_buffer(16))
2321            .null_bit_buffer(Some(Buffer::from(bit_v)))
2322            .build()
2323            .unwrap();
2324        let new_data = data.slice(1, 14);
2325        assert_eq!(
2326            data.get_slice_memory_size().unwrap() - 8,
2327            new_data.get_slice_memory_size().unwrap()
2328        );
2329        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2330        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2331        let string_data = ArrayData::try_new(
2332            DataType::Utf8,
2333            3,
2334            Some(Buffer::from_iter(vec![true, false, true])),
2335            0,
2336            vec![offsets_buffer, data_buffer],
2337            vec![],
2338        )
2339        .unwrap();
2340        let string_data_slice = string_data.slice(1, 2);
2341        //4 bytes of offset and 2 bytes of data reduced by slicing.
2342        assert_eq!(
2343            string_data.get_slice_memory_size().unwrap() - 6,
2344            string_data_slice.get_slice_memory_size().unwrap()
2345        );
2346    }
2347
2348    #[test]
2349    fn test_count_nulls() {
2350        let buffer = Buffer::from([0b00010110, 0b10011111]);
2351        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2352        let count = count_nulls(Some(&buffer), 0, 16);
2353        assert_eq!(count, 7);
2354
2355        let count = count_nulls(Some(&buffer), 4, 8);
2356        assert_eq!(count, 3);
2357    }
2358
2359    #[test]
2360    fn test_contains_nulls() {
2361        let buffer: Buffer =
2362            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2363        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2364        assert!(contains_nulls(Some(&buffer), 0, 6));
2365        assert!(contains_nulls(Some(&buffer), 0, 3));
2366        assert!(!contains_nulls(Some(&buffer), 3, 2));
2367        assert!(!contains_nulls(Some(&buffer), 0, 0));
2368    }
2369
2370    #[test]
2371    fn test_alignment() {
2372        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2373        let sliced = buffer.slice(1);
2374
2375        let mut data = ArrayData {
2376            data_type: DataType::Int32,
2377            len: 0,
2378            offset: 0,
2379            buffers: vec![buffer],
2380            child_data: vec![],
2381            nulls: None,
2382        };
2383        data.validate_full().unwrap();
2384
2385        // break alignment in data
2386        data.buffers[0] = sliced;
2387        let err = data.validate().unwrap_err();
2388
2389        assert_eq!(
2390            err.to_string(),
2391            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2392        );
2393
2394        data.align_buffers();
2395        data.validate_full().unwrap();
2396    }
2397
2398    #[test]
2399    fn test_alignment_struct() {
2400        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2401        let sliced = buffer.slice(1);
2402
2403        let child_data = ArrayData {
2404            data_type: DataType::Int32,
2405            len: 0,
2406            offset: 0,
2407            buffers: vec![buffer],
2408            child_data: vec![],
2409            nulls: None,
2410        };
2411
2412        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2413        let mut data = ArrayData {
2414            data_type: schema,
2415            len: 0,
2416            offset: 0,
2417            buffers: vec![],
2418            child_data: vec![child_data],
2419            nulls: None,
2420        };
2421        data.validate_full().unwrap();
2422
2423        // break alignment in child data
2424        data.child_data[0].buffers[0] = sliced;
2425        let err = data.validate().unwrap_err();
2426
2427        assert_eq!(
2428            err.to_string(),
2429            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2430        );
2431
2432        data.align_buffers();
2433        data.validate_full().unwrap();
2434    }
2435
2436    #[test]
2437    fn test_null_view_types() {
2438        let array_len = 32;
2439        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2440        assert_eq!(array.len(), array_len);
2441        for i in 0..array.len() {
2442            assert!(array.is_null(i));
2443        }
2444
2445        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2446        assert_eq!(array.len(), array_len);
2447        for i in 0..array.len() {
2448            assert!(array.is_null(i));
2449        }
2450    }
2451}