arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
19//! common attributes and operations for Arrow array.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35    null_bit_buffer: Option<&NullBuffer>,
36    offset: usize,
37    len: usize,
38) -> bool {
39    match null_bit_buffer {
40        Some(buffer) => {
41            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42                Some((start, end)) => start != 0 || end != len,
43                None => len != 0, // No non-null values
44            }
45        }
46        None => false, // No null buffer
47    }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52    null_bit_buffer: Option<&NullBuffer>,
53    offset: usize,
54    len: usize,
55) -> usize {
56    if let Some(buf) = null_bit_buffer {
57        let buffer = buf.buffer();
58        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59    } else {
60        0
61    }
62}
63
64/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67    let empty_buffer = MutableBuffer::new(0);
68    match data_type {
69        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70        DataType::Boolean => {
71            let bytes = bit_util::ceil(capacity, 8);
72            let buffer = MutableBuffer::new(bytes);
73            [buffer, empty_buffer]
74        }
75        DataType::UInt8
76        | DataType::UInt16
77        | DataType::UInt32
78        | DataType::UInt64
79        | DataType::Int8
80        | DataType::Int16
81        | DataType::Int32
82        | DataType::Int64
83        | DataType::Float16
84        | DataType::Float32
85        | DataType::Float64
86        | DataType::Decimal32(_, _)
87        | DataType::Decimal64(_, _)
88        | DataType::Decimal128(_, _)
89        | DataType::Decimal256(_, _)
90        | DataType::Date32
91        | DataType::Time32(_)
92        | DataType::Date64
93        | DataType::Time64(_)
94        | DataType::Duration(_)
95        | DataType::Timestamp(_, _)
96        | DataType::Interval(_) => [
97            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98            empty_buffer,
99        ],
100        DataType::Utf8 | DataType::Binary => {
101            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102            // safety: `unsafe` code assumes that this buffer is initialized with one element
103            buffer.push(0i32);
104            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105        }
106        DataType::LargeUtf8 | DataType::LargeBinary => {
107            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108            // safety: `unsafe` code assumes that this buffer is initialized with one element
109            buffer.push(0i64);
110            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111        }
112        DataType::BinaryView | DataType::Utf8View => [
113            MutableBuffer::new(capacity * mem::size_of::<u128>()),
114            empty_buffer,
115        ],
116        DataType::List(_) | DataType::Map(_, _) => {
117            // offset buffer always starts with a zero
118            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119            buffer.push(0i32);
120            [buffer, empty_buffer]
121        }
122        DataType::ListView(_) => [
123            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124            MutableBuffer::new(capacity * mem::size_of::<i32>()),
125        ],
126        DataType::LargeList(_) => {
127            // offset buffer always starts with a zero
128            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129            buffer.push(0i64);
130            [buffer, empty_buffer]
131        }
132        DataType::LargeListView(_) => [
133            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134            MutableBuffer::new(capacity * mem::size_of::<i64>()),
135        ],
136        DataType::FixedSizeBinary(size) => {
137            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138        }
139        DataType::Dictionary(k, _) => [
140            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141            empty_buffer,
142        ],
143        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144            [empty_buffer, MutableBuffer::new(0)]
145        }
146        DataType::Union(_, mode) => {
147            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148            match mode {
149                UnionMode::Sparse => [type_ids, empty_buffer],
150                UnionMode::Dense => {
151                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152                    [type_ids, offsets]
153                }
154            }
155        }
156    }
157}
158
159/// A generic representation of Arrow array data which encapsulates common attributes
160/// and operations for Arrow array.
161///
162/// Specific operations for different arrays types (e.g., primitive, list, struct)
163/// are implemented in `Array`.
164///
165/// # Memory Layout
166///
167/// `ArrayData` has references to one or more underlying data buffers
168/// and optional child ArrayData, depending on type as illustrated
169/// below. Bitmaps are not shown for simplicity but they are stored
170/// similarly to the buffers.
171///
172/// ```text
173///                        offset
174///                       points to
175/// ┌───────────────────┐ start of  ┌───────┐       Different
176/// │                   │   data    │       │     ArrayData may
177/// │ArrayData {        │           │....   │     also refers to
178/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
179/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
180/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
181/// │  buffers: [       │           │5882   │◀─
182/// │    ...            │  │        │4323   │
183/// │  ]                │   ─ ─ ─ ─▶│4859   │
184/// │  child_data: [    │           │....   │
185/// │    ...            │           │       │
186/// │  ]                │           └───────┘
187/// │}                  │
188/// │                   │            Shared Buffer uses
189/// │               │   │            bytes::Bytes to hold
190/// └───────────────────┘            actual data values
191///           ┌ ─ ─ ┘
192///
193///           ▼
194/// ┌───────────────────┐
195/// │ArrayData {        │
196/// │  ...              │
197/// │}                  │
198/// │                   │
199/// └───────────────────┘
200///
201/// Child ArrayData may also have its own buffers and children
202/// ```
203
204#[derive(Debug, Clone)]
205pub struct ArrayData {
206    /// The data type
207    data_type: DataType,
208
209    /// The number of elements
210    len: usize,
211
212    /// The offset in number of items (not bytes).
213    ///
214    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
215    /// does NOT apply to [`Self::nulls`].
216    offset: usize,
217
218    /// The buffers that store the actual data for this array, as defined
219    /// in the [Arrow Spec].
220    ///
221    /// Depending on the array types, [`Self::buffers`] can hold different
222    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
223    /// positions.
224    ///
225    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
226    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
227    /// we need to satisfy `len`.
228    ///
229    /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout)
230    buffers: Vec<Buffer>,
231
232    /// The child(ren) of this array.
233    ///
234    /// Only non-empty for nested types, such as `ListArray` and
235    /// `StructArray`.
236    ///
237    /// The first logical element in each child element begins at `offset`.
238    ///
239    /// If the child element also has an offset then these offsets are
240    /// cumulative.
241    child_data: Vec<ArrayData>,
242
243    /// The null bitmap.
244    ///
245    /// `None` indicates all values are non-null in this array.
246    ///
247    /// [`Self::offset]` does not apply to the null bitmap. While the
248    /// BooleanBuffer may be sliced (have its own offset) internally, this
249    /// `NullBuffer` always represents exactly `len` elements.
250    nulls: Option<NullBuffer>,
251}
252
253/// A thread-safe, shared reference to the Arrow array data.
254pub type ArrayDataRef = Arc<ArrayData>;
255
256impl ArrayData {
257    /// Create a new ArrayData instance;
258    ///
259    /// If `null_count` is not specified, the number of nulls in
260    /// null_bit_buffer is calculated.
261    ///
262    /// If the number of nulls is 0 then the null_bit_buffer
263    /// is set to `None`.
264    ///
265    /// # Safety
266    ///
267    /// The input values *must* form a valid Arrow array for
268    /// `data_type`, or undefined behavior can result.
269    ///
270    /// Note: This is a low level API and most users of the arrow
271    /// crate should create arrays using the methods in the `array`
272    /// module.
273    pub unsafe fn new_unchecked(
274        data_type: DataType,
275        len: usize,
276        null_count: Option<usize>,
277        null_bit_buffer: Option<Buffer>,
278        offset: usize,
279        buffers: Vec<Buffer>,
280        child_data: Vec<ArrayData>,
281    ) -> Self {
282        let mut skip_validation = UnsafeFlag::new();
283        // SAFETY: caller responsible for ensuring data is valid
284        skip_validation.set(true);
285
286        ArrayDataBuilder {
287            data_type,
288            len,
289            null_count,
290            null_bit_buffer,
291            nulls: None,
292            offset,
293            buffers,
294            child_data,
295            align_buffers: false,
296            skip_validation,
297        }
298        .build()
299        .unwrap()
300    }
301
302    /// Create a new ArrayData, validating that the provided buffers form a valid
303    /// Arrow array of the specified data type.
304    ///
305    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
306    /// is set to `None`.
307    ///
308    /// Internally this calls through to [`Self::validate_data`]
309    ///
310    /// Note: This is a low level API and most users of the arrow crate should create
311    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
312    pub fn try_new(
313        data_type: DataType,
314        len: usize,
315        null_bit_buffer: Option<Buffer>,
316        offset: usize,
317        buffers: Vec<Buffer>,
318        child_data: Vec<ArrayData>,
319    ) -> Result<Self, ArrowError> {
320        // we must check the length of `null_bit_buffer` first
321        // because we use this buffer to calculate `null_count`
322        // in `Self::new_unchecked`.
323        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
324            let needed_len = bit_util::ceil(len + offset, 8);
325            if null_bit_buffer.len() < needed_len {
326                return Err(ArrowError::InvalidArgumentError(format!(
327                    "null_bit_buffer size too small. got {} needed {}",
328                    null_bit_buffer.len(),
329                    needed_len
330                )));
331            }
332        }
333        // Safety justification: `validate_full` is called below
334        let new_self = unsafe {
335            Self::new_unchecked(
336                data_type,
337                len,
338                None,
339                null_bit_buffer,
340                offset,
341                buffers,
342                child_data,
343            )
344        };
345
346        // As the data is not trusted, do a full validation of its contents
347        // We don't need to validate children as we can assume that the
348        // [`ArrayData`] in `child_data` have already been validated through
349        // a call to `ArrayData::try_new` or created using unsafe
350        new_self.validate_data()?;
351        Ok(new_self)
352    }
353
354    /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`]
355    #[inline]
356    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
357        ArrayDataBuilder::new(data_type)
358    }
359
360    /// Returns a reference to the [`DataType`] of this [`ArrayData`]
361    #[inline]
362    pub const fn data_type(&self) -> &DataType {
363        &self.data_type
364    }
365
366    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
367    pub fn buffers(&self) -> &[Buffer] {
368        &self.buffers
369    }
370
371    /// Returns a slice of children [`ArrayData`]. This will be non
372    /// empty for type such as lists and structs.
373    pub fn child_data(&self) -> &[ArrayData] {
374        &self.child_data[..]
375    }
376
377    /// Returns whether the element at index `i` is null
378    #[inline]
379    pub fn is_null(&self, i: usize) -> bool {
380        match &self.nulls {
381            Some(v) => v.is_null(i),
382            None => false,
383        }
384    }
385
386    /// Returns a reference to the null buffer of this [`ArrayData`] if any
387    ///
388    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
389    #[inline]
390    pub fn nulls(&self) -> Option<&NullBuffer> {
391        self.nulls.as_ref()
392    }
393
394    /// Returns whether the element at index `i` is not null
395    #[inline]
396    pub fn is_valid(&self, i: usize) -> bool {
397        !self.is_null(i)
398    }
399
400    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
401    #[inline]
402    pub const fn len(&self) -> usize {
403        self.len
404    }
405
406    /// Returns whether this [`ArrayData`] is empty
407    #[inline]
408    pub const fn is_empty(&self) -> bool {
409        self.len == 0
410    }
411
412    /// Returns the offset of this [`ArrayData`]
413    #[inline]
414    pub const fn offset(&self) -> usize {
415        self.offset
416    }
417
418    /// Returns the total number of nulls in this array
419    #[inline]
420    pub fn null_count(&self) -> usize {
421        self.nulls
422            .as_ref()
423            .map(|x| x.null_count())
424            .unwrap_or_default()
425    }
426
427    /// Returns the total number of bytes of memory occupied by the
428    /// buffers owned by this [`ArrayData`] and all of its
429    /// children. (See also diagram on [`ArrayData`]).
430    ///
431    /// Note that this [`ArrayData`] may only refer to a subset of the
432    /// data in the underlying [`Buffer`]s (due to `offset` and
433    /// `length`), but the size returned includes the entire size of
434    /// the buffers.
435    ///
436    /// If multiple [`ArrayData`]s refer to the same underlying
437    /// [`Buffer`]s they will both report the same size.
438    pub fn get_buffer_memory_size(&self) -> usize {
439        let mut size = 0;
440        for buffer in &self.buffers {
441            size += buffer.capacity();
442        }
443        if let Some(bitmap) = &self.nulls {
444            size += bitmap.buffer().capacity()
445        }
446        for child in &self.child_data {
447            size += child.get_buffer_memory_size();
448        }
449        size
450    }
451
452    /// Returns the total number of the bytes of memory occupied by
453    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
454    ///
455    /// This is approximately the number of bytes if a new
456    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
457    /// exactly the data needed.
458    ///
459    /// For example, a [`DataType::Int64`] with `100` elements,
460    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
461    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
462    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
463    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
464    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
465        let mut result: usize = 0;
466        let layout = layout(&self.data_type);
467
468        for spec in layout.buffers.iter() {
469            match spec {
470                BufferSpec::FixedWidth { byte_width, .. } => {
471                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
472                        ArrowError::ComputeError(
473                            "Integer overflow computing buffer size".to_string(),
474                        )
475                    })?;
476                    result += buffer_size;
477                }
478                BufferSpec::VariableWidth => {
479                    let buffer_len: usize;
480                    match self.data_type {
481                        DataType::Utf8 | DataType::Binary => {
482                            let offsets = self.typed_offsets::<i32>()?;
483                            buffer_len = (offsets[self.len] - offsets[0] ) as usize;
484                        }
485                        DataType::LargeUtf8 | DataType::LargeBinary => {
486                            let offsets = self.typed_offsets::<i64>()?;
487                            buffer_len = (offsets[self.len] - offsets[0]) as usize;
488                        }
489                        _ => {
490                            return Err(ArrowError::NotYetImplemented(format!(
491                            "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
492                            self.data_type
493                            )))
494                        }
495                    };
496                    result += buffer_len;
497                }
498                BufferSpec::BitMap => {
499                    let buffer_size = bit_util::ceil(self.len, 8);
500                    result += buffer_size;
501                }
502                BufferSpec::AlwaysNull => {
503                    // Nothing to do
504                }
505            }
506        }
507
508        if self.nulls().is_some() {
509            result += bit_util::ceil(self.len, 8);
510        }
511
512        for child in &self.child_data {
513            result += child.get_slice_memory_size()?;
514        }
515        Ok(result)
516    }
517
518    /// Returns the total number of bytes of memory occupied
519    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
520    /// children. (See also diagram on [`ArrayData`]).
521    ///
522    /// Equivalent to:
523    ///  `size_of_val(self)` +
524    ///  [`Self::get_buffer_memory_size`] +
525    ///  `size_of_val(child)` for all children
526    pub fn get_array_memory_size(&self) -> usize {
527        let mut size = mem::size_of_val(self);
528
529        // Calculate rest of the fields top down which contain actual data
530        for buffer in &self.buffers {
531            size += mem::size_of::<Buffer>();
532            size += buffer.capacity();
533        }
534        if let Some(nulls) = &self.nulls {
535            size += nulls.buffer().capacity();
536        }
537        for child in &self.child_data {
538            size += child.get_array_memory_size();
539        }
540
541        size
542    }
543
544    /// Creates a zero-copy slice of itself. This creates a new
545    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
546    /// different offset and len
547    ///
548    /// # Panics
549    ///
550    /// Panics if `offset + length > self.len()`.
551    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
552        assert!((offset + length) <= self.len());
553
554        if let DataType::Struct(_) = self.data_type() {
555            // Slice into children
556            let new_offset = self.offset + offset;
557            let new_data = ArrayData {
558                data_type: self.data_type().clone(),
559                len: length,
560                offset: new_offset,
561                buffers: self.buffers.clone(),
562                // Slice child data, to propagate offsets down to them
563                child_data: self
564                    .child_data()
565                    .iter()
566                    .map(|data| data.slice(offset, length))
567                    .collect(),
568                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
569            };
570
571            new_data
572        } else {
573            let mut new_data = self.clone();
574
575            new_data.len = length;
576            new_data.offset = offset + self.offset;
577            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
578
579            new_data
580        }
581    }
582
583    /// Returns the `buffer` as a slice of type `T` starting at self.offset
584    ///
585    /// # Panics
586    /// This function panics if:
587    /// * the buffer is not byte-aligned with type T, or
588    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
589    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
590        &self.buffers()[buffer].typed_data()[self.offset..]
591    }
592
593    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
594    pub fn new_null(data_type: &DataType, len: usize) -> Self {
595        let bit_len = bit_util::ceil(len, 8);
596        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
597
598        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
599            Some(width) => (vec![zeroed(width * len)], vec![], true),
600            None => match data_type {
601                DataType::Null => (vec![], vec![], false),
602                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
603                DataType::Binary | DataType::Utf8 => {
604                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
605                }
606                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
607                DataType::LargeBinary | DataType::LargeUtf8 => {
608                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
609                }
610                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
611                DataType::List(f) | DataType::Map(f, _) => (
612                    vec![zeroed((len + 1) * 4)],
613                    vec![ArrayData::new_empty(f.data_type())],
614                    true,
615                ),
616                DataType::LargeList(f) => (
617                    vec![zeroed((len + 1) * 8)],
618                    vec![ArrayData::new_empty(f.data_type())],
619                    true,
620                ),
621                DataType::FixedSizeList(f, list_len) => (
622                    vec![],
623                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
624                    true,
625                ),
626                DataType::Struct(fields) => (
627                    vec![],
628                    fields
629                        .iter()
630                        .map(|f| Self::new_null(f.data_type(), len))
631                        .collect(),
632                    true,
633                ),
634                DataType::Dictionary(k, v) => (
635                    vec![zeroed(k.primitive_width().unwrap() * len)],
636                    vec![ArrayData::new_empty(v.as_ref())],
637                    true,
638                ),
639                DataType::Union(f, mode) => {
640                    let (id, _) = f.iter().next().unwrap();
641                    let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
642                    let buffers = match mode {
643                        UnionMode::Sparse => vec![ids],
644                        UnionMode::Dense => {
645                            let end_offset = i32::from_usize(len).unwrap();
646                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
647                        }
648                    };
649
650                    let children = f
651                        .iter()
652                        .enumerate()
653                        .map(|(idx, (_, f))| {
654                            if idx == 0 || *mode == UnionMode::Sparse {
655                                Self::new_null(f.data_type(), len)
656                            } else {
657                                Self::new_empty(f.data_type())
658                            }
659                        })
660                        .collect();
661
662                    (buffers, children, false)
663                }
664                DataType::RunEndEncoded(r, v) => {
665                    let runs = match r.data_type() {
666                        DataType::Int16 => {
667                            let i = i16::from_usize(len).expect("run overflow");
668                            Buffer::from_slice_ref([i])
669                        }
670                        DataType::Int32 => {
671                            let i = i32::from_usize(len).expect("run overflow");
672                            Buffer::from_slice_ref([i])
673                        }
674                        DataType::Int64 => {
675                            let i = i64::from_usize(len).expect("run overflow");
676                            Buffer::from_slice_ref([i])
677                        }
678                        dt => unreachable!("Invalid run ends data type {dt}"),
679                    };
680
681                    let builder = ArrayData::builder(r.data_type().clone())
682                        .len(1)
683                        .buffers(vec![runs]);
684
685                    // SAFETY:
686                    // Valid by construction
687                    let runs = unsafe { builder.build_unchecked() };
688                    (
689                        vec![],
690                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
691                        false,
692                    )
693                }
694                d => unreachable!("{d}"),
695            },
696        };
697
698        let mut builder = ArrayDataBuilder::new(data_type.clone())
699            .len(len)
700            .buffers(buffers)
701            .child_data(child_data);
702
703        if has_nulls {
704            builder = builder.nulls(Some(NullBuffer::new_null(len)))
705        }
706
707        // SAFETY:
708        // Data valid by construction
709        unsafe { builder.build_unchecked() }
710    }
711
712    /// Returns a new empty [ArrayData] valid for `data_type`.
713    pub fn new_empty(data_type: &DataType) -> Self {
714        Self::new_null(data_type, 0)
715    }
716
717    /// Verifies that the buffers meet the minimum alignment requirements for the data type
718    ///
719    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
720    ///
721    /// This can be useful for when interacting with data sent over IPC or FFI, that may
722    /// not meet the minimum alignment requirements
723    ///
724    /// This also aligns buffers of children data
725    pub fn align_buffers(&mut self) {
726        let layout = layout(&self.data_type);
727        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
728            if let BufferSpec::FixedWidth { alignment, .. } = spec {
729                if buffer.as_ptr().align_offset(*alignment) != 0 {
730                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
731                }
732            }
733        }
734        // align children data recursively
735        for data in self.child_data.iter_mut() {
736            data.align_buffers()
737        }
738    }
739
740    /// "cheap" validation of an `ArrayData`. Ensures buffers are
741    /// sufficiently sized to store `len` + `offset` total elements of
742    /// `data_type` and performs other inexpensive consistency checks.
743    ///
744    /// This check is "cheap" in the sense that it does not validate the
745    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
746    /// are within the bounds of the values buffer).
747    ///
748    /// See [ArrayData::validate_data] to validate fully the offset content
749    /// and the validity of utf8 data
750    pub fn validate(&self) -> Result<(), ArrowError> {
751        // Need at least this mich space in each buffer
752        let len_plus_offset = self.len + self.offset;
753
754        // Check that the data layout conforms to the spec
755        let layout = layout(&self.data_type);
756
757        if !layout.can_contain_null_mask && self.nulls.is_some() {
758            return Err(ArrowError::InvalidArgumentError(format!(
759                "Arrays of type {:?} cannot contain a null bitmask",
760                self.data_type,
761            )));
762        }
763
764        // Check data buffers length for view types and other types
765        if self.buffers.len() < layout.buffers.len()
766            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
767        {
768            return Err(ArrowError::InvalidArgumentError(format!(
769                "Expected {} buffers in array of type {:?}, got {}",
770                layout.buffers.len(),
771                self.data_type,
772                self.buffers.len(),
773            )));
774        }
775
776        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
777            match spec {
778                BufferSpec::FixedWidth {
779                    byte_width,
780                    alignment,
781                } => {
782                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
783
784                    if buffer.len() < min_buffer_size {
785                        return Err(ArrowError::InvalidArgumentError(format!(
786                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
787                            min_buffer_size, i, self.data_type, buffer.len()
788                        )));
789                    }
790
791                    let align_offset = buffer.as_ptr().align_offset(*alignment);
792                    if align_offset != 0 {
793                        return Err(ArrowError::InvalidArgumentError(format!(
794                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
795                            self.data_type, align_offset.min(alignment - align_offset)
796                        )));
797                    }
798                }
799                BufferSpec::VariableWidth => {
800                    // not cheap to validate (need to look at the
801                    // data). Partially checked in validate_offsets
802                    // called below. Can check with `validate_full`
803                }
804                BufferSpec::BitMap => {
805                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
806                    if buffer.len() < min_buffer_size {
807                        return Err(ArrowError::InvalidArgumentError(format!(
808                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
809                            min_buffer_size, i, self.data_type, buffer.len()
810                        )));
811                    }
812                }
813                BufferSpec::AlwaysNull => {
814                    // Nothing to validate
815                }
816            }
817        }
818
819        // check null bit buffer size
820        if let Some(nulls) = self.nulls() {
821            if nulls.null_count() > self.len {
822                return Err(ArrowError::InvalidArgumentError(format!(
823                    "null_count {} for an array exceeds length of {} elements",
824                    nulls.null_count(),
825                    self.len
826                )));
827            }
828
829            let actual_len = nulls.validity().len();
830            let needed_len = bit_util::ceil(len_plus_offset, 8);
831            if actual_len < needed_len {
832                return Err(ArrowError::InvalidArgumentError(format!(
833                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
834                )));
835            }
836
837            if nulls.len() != self.len {
838                return Err(ArrowError::InvalidArgumentError(format!(
839                    "null buffer incorrect size. got {} expected {}",
840                    nulls.len(),
841                    self.len
842                )));
843            }
844        }
845
846        self.validate_child_data()?;
847
848        // Additional Type specific checks
849        match &self.data_type {
850            DataType::Utf8 | DataType::Binary => {
851                self.validate_offsets::<i32>(self.buffers[1].len())?;
852            }
853            DataType::LargeUtf8 | DataType::LargeBinary => {
854                self.validate_offsets::<i64>(self.buffers[1].len())?;
855            }
856            DataType::Dictionary(key_type, _value_type) => {
857                // At the moment, constructing a DictionaryArray will also check this
858                if !DataType::is_dictionary_key_type(key_type) {
859                    return Err(ArrowError::InvalidArgumentError(format!(
860                        "Dictionary key type must be integer, but was {key_type}"
861                    )));
862                }
863            }
864            DataType::RunEndEncoded(run_ends_type, _) => {
865                if run_ends_type.is_nullable() {
866                    return Err(ArrowError::InvalidArgumentError(
867                        "The nullable should be set to false for the field defining run_ends array.".to_string()
868                    ));
869                }
870                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
871                    return Err(ArrowError::InvalidArgumentError(format!(
872                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
873                        run_ends_type.data_type()
874                    )));
875                }
876            }
877            _ => {}
878        };
879
880        Ok(())
881    }
882
883    /// Returns a reference to the data in `buffer` as a typed slice
884    /// (typically `&[i32]` or `&[i64]`) after validating. The
885    /// returned slice is guaranteed to have at least `self.len + 1`
886    /// entries.
887    ///
888    /// For an empty array, the `buffer` can also be empty.
889    fn typed_offsets<T: ArrowNativeType + num::Num>(&self) -> Result<&[T], ArrowError> {
890        // An empty list-like array can have 0 offsets
891        if self.len == 0 && self.buffers[0].is_empty() {
892            return Ok(&[]);
893        }
894
895        self.typed_buffer(0, self.len + 1)
896    }
897
898    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
899    fn typed_buffer<T: ArrowNativeType + num::Num>(
900        &self,
901        idx: usize,
902        len: usize,
903    ) -> Result<&[T], ArrowError> {
904        let buffer = &self.buffers[idx];
905
906        let required_len = (len + self.offset) * mem::size_of::<T>();
907
908        if buffer.len() < required_len {
909            return Err(ArrowError::InvalidArgumentError(format!(
910                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
911                idx,
912                self.data_type,
913                required_len,
914                buffer.len()
915            )));
916        }
917
918        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
919    }
920
921    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
922    /// offsets (of type T) into some other buffer of `values_length` bytes long
923    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
924        &self,
925        values_length: usize,
926    ) -> Result<(), ArrowError> {
927        // Justification: buffer size was validated above
928        let offsets = self.typed_offsets::<T>()?;
929        if offsets.is_empty() {
930            return Ok(());
931        }
932
933        let first_offset = offsets[0].to_usize().ok_or_else(|| {
934            ArrowError::InvalidArgumentError(format!(
935                "Error converting offset[0] ({}) to usize for {}",
936                offsets[0], self.data_type
937            ))
938        })?;
939
940        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
941            ArrowError::InvalidArgumentError(format!(
942                "Error converting offset[{}] ({}) to usize for {}",
943                self.len, offsets[self.len], self.data_type
944            ))
945        })?;
946
947        if first_offset > values_length {
948            return Err(ArrowError::InvalidArgumentError(format!(
949                "First offset {} of {} is larger than values length {}",
950                first_offset, self.data_type, values_length,
951            )));
952        }
953
954        if last_offset > values_length {
955            return Err(ArrowError::InvalidArgumentError(format!(
956                "Last offset {} of {} is larger than values length {}",
957                last_offset, self.data_type, values_length,
958            )));
959        }
960
961        if first_offset > last_offset {
962            return Err(ArrowError::InvalidArgumentError(format!(
963                "First offset {} in {} is smaller than last offset {}",
964                first_offset, self.data_type, last_offset,
965            )));
966        }
967
968        Ok(())
969    }
970
971    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
972    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
973    fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
974        &self,
975        values_length: usize,
976    ) -> Result<(), ArrowError> {
977        let offsets: &[T] = self.typed_buffer(0, self.len)?;
978        let sizes: &[T] = self.typed_buffer(1, self.len)?;
979        for i in 0..values_length {
980            let size = sizes[i].to_usize().ok_or_else(|| {
981                ArrowError::InvalidArgumentError(format!(
982                    "Error converting size[{}] ({}) to usize for {}",
983                    i, sizes[i], self.data_type
984                ))
985            })?;
986            let offset = offsets[i].to_usize().ok_or_else(|| {
987                ArrowError::InvalidArgumentError(format!(
988                    "Error converting offset[{}] ({}) to usize for {}",
989                    i, offsets[i], self.data_type
990                ))
991            })?;
992            if size
993                .checked_add(offset)
994                .expect("Offset and size have exceeded the usize boundary")
995                > values_length
996            {
997                return Err(ArrowError::InvalidArgumentError(format!(
998                    "Size {} at index {} is larger than the remaining values for {}",
999                    size, i, self.data_type
1000                )));
1001            }
1002        }
1003        Ok(())
1004    }
1005
1006    /// Validates the layout of `child_data` ArrayData structures
1007    fn validate_child_data(&self) -> Result<(), ArrowError> {
1008        match &self.data_type {
1009            DataType::List(field) | DataType::Map(field, _) => {
1010                let values_data = self.get_single_valid_child_data(field.data_type())?;
1011                self.validate_offsets::<i32>(values_data.len)?;
1012                Ok(())
1013            }
1014            DataType::LargeList(field) => {
1015                let values_data = self.get_single_valid_child_data(field.data_type())?;
1016                self.validate_offsets::<i64>(values_data.len)?;
1017                Ok(())
1018            }
1019            DataType::ListView(field) => {
1020                let values_data = self.get_single_valid_child_data(field.data_type())?;
1021                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1022                Ok(())
1023            }
1024            DataType::LargeListView(field) => {
1025                let values_data = self.get_single_valid_child_data(field.data_type())?;
1026                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1027                Ok(())
1028            }
1029            DataType::FixedSizeList(field, list_size) => {
1030                let values_data = self.get_single_valid_child_data(field.data_type())?;
1031
1032                let list_size: usize = (*list_size).try_into().map_err(|_| {
1033                    ArrowError::InvalidArgumentError(format!(
1034                        "{} has a negative list_size {}",
1035                        self.data_type, list_size
1036                    ))
1037                })?;
1038
1039                let expected_values_len = self.len
1040                    .checked_mul(list_size)
1041                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1042
1043                if values_data.len < expected_values_len {
1044                    return Err(ArrowError::InvalidArgumentError(format!(
1045                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1046                        values_data.len, self.len, list_size, self.data_type
1047                    )));
1048                }
1049
1050                Ok(())
1051            }
1052            DataType::Struct(fields) => {
1053                self.validate_num_child_data(fields.len())?;
1054                for (i, field) in fields.iter().enumerate() {
1055                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1056
1057                    // Ensure child field has sufficient size
1058                    if field_data.len < self.len {
1059                        return Err(ArrowError::InvalidArgumentError(format!(
1060                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1061                            self.data_type, i, field.name(), field_data.len, self.len
1062                        )));
1063                    }
1064                }
1065                Ok(())
1066            }
1067            DataType::RunEndEncoded(run_ends_field, values_field) => {
1068                self.validate_num_child_data(2)?;
1069                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1070                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1071                if run_ends_data.len != values_data.len {
1072                    return Err(ArrowError::InvalidArgumentError(format!(
1073                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1074                        run_ends_data.len, values_data.len
1075                    )));
1076                }
1077                if run_ends_data.nulls.is_some() {
1078                    return Err(ArrowError::InvalidArgumentError(
1079                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1080                    ));
1081                }
1082                Ok(())
1083            }
1084            DataType::Union(fields, mode) => {
1085                self.validate_num_child_data(fields.len())?;
1086
1087                for (i, (_, field)) in fields.iter().enumerate() {
1088                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1089
1090                    if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1091                        return Err(ArrowError::InvalidArgumentError(format!(
1092                            "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1093                            i, field_data.len, self.len + self.offset
1094                        )));
1095                    }
1096                }
1097                Ok(())
1098            }
1099            DataType::Dictionary(_key_type, value_type) => {
1100                self.get_single_valid_child_data(value_type)?;
1101                Ok(())
1102            }
1103            _ => {
1104                // other types do not have child data
1105                if !self.child_data.is_empty() {
1106                    return Err(ArrowError::InvalidArgumentError(format!(
1107                        "Expected no child arrays for type {} but got {}",
1108                        self.data_type,
1109                        self.child_data.len()
1110                    )));
1111                }
1112                Ok(())
1113            }
1114        }
1115    }
1116
1117    /// Ensures that this array data has a single child_data with the
1118    /// expected type, and calls `validate()` on it. Returns a
1119    /// reference to that child_data
1120    fn get_single_valid_child_data(
1121        &self,
1122        expected_type: &DataType,
1123    ) -> Result<&ArrayData, ArrowError> {
1124        self.validate_num_child_data(1)?;
1125        self.get_valid_child_data(0, expected_type)
1126    }
1127
1128    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1129    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1130        if self.child_data.len() != expected_len {
1131            Err(ArrowError::InvalidArgumentError(format!(
1132                "Value data for {} should contain {} child data array(s), had {}",
1133                self.data_type,
1134                expected_len,
1135                self.child_data.len()
1136            )))
1137        } else {
1138            Ok(())
1139        }
1140    }
1141
1142    /// Ensures that `child_data[i]` has the expected type, calls
1143    /// `validate()` on it, and returns a reference to that child_data
1144    fn get_valid_child_data(
1145        &self,
1146        i: usize,
1147        expected_type: &DataType,
1148    ) -> Result<&ArrayData, ArrowError> {
1149        let values_data = self.child_data.get(i).ok_or_else(|| {
1150            ArrowError::InvalidArgumentError(format!(
1151                "{} did not have enough child arrays. Expected at least {} but had only {}",
1152                self.data_type,
1153                i + 1,
1154                self.child_data.len()
1155            ))
1156        })?;
1157
1158        if expected_type != &values_data.data_type {
1159            return Err(ArrowError::InvalidArgumentError(format!(
1160                "Child type mismatch for {}. Expected {} but child data had {}",
1161                self.data_type, expected_type, values_data.data_type
1162            )));
1163        }
1164
1165        values_data.validate()?;
1166        Ok(values_data)
1167    }
1168
1169    /// Validate that the data contained within this [`ArrayData`] is valid
1170    ///
1171    /// 1. Null count is correct
1172    /// 2. All offsets are valid
1173    /// 3. All String data is valid UTF-8
1174    /// 4. All dictionary offsets are valid
1175    ///
1176    /// Internally this calls:
1177    ///
1178    /// * [`Self::validate`]
1179    /// * [`Self::validate_nulls`]
1180    /// * [`Self::validate_values`]
1181    ///
1182    /// Note: this does not recurse into children, for a recursive variant
1183    /// see [`Self::validate_full`]
1184    pub fn validate_data(&self) -> Result<(), ArrowError> {
1185        self.validate()?;
1186
1187        self.validate_nulls()?;
1188        self.validate_values()?;
1189        Ok(())
1190    }
1191
1192    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1193    ///
1194    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1195    /// and all its children recursively
1196    pub fn validate_full(&self) -> Result<(), ArrowError> {
1197        self.validate_data()?;
1198        // validate all children recursively
1199        self.child_data
1200            .iter()
1201            .enumerate()
1202            .try_for_each(|(i, child_data)| {
1203                child_data.validate_full().map_err(|e| {
1204                    ArrowError::InvalidArgumentError(format!(
1205                        "{} child #{} invalid: {}",
1206                        self.data_type, i, e
1207                    ))
1208                })
1209            })?;
1210        Ok(())
1211    }
1212
1213    /// Validates the values stored within this [`ArrayData`] are valid
1214    /// without recursing into child [`ArrayData`]
1215    ///
1216    /// Does not (yet) check
1217    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1218    /// 2. the the null count is correct and that any
1219    /// 3. nullability requirements of its children are correct
1220    ///
1221    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1222    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1223        if let Some(nulls) = &self.nulls {
1224            let actual = nulls.len() - nulls.inner().count_set_bits();
1225            if actual != nulls.null_count() {
1226                return Err(ArrowError::InvalidArgumentError(format!(
1227                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1228                    nulls.null_count(),
1229                    actual
1230                )));
1231            }
1232        }
1233
1234        // In general non-nullable children should not contain nulls, however, for certain
1235        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1236        // space in the child. As such we permit nulls in the children in the corresponding
1237        // positions for such types
1238        match &self.data_type {
1239            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1240                if !f.is_nullable() {
1241                    self.validate_non_nullable(None, &self.child_data[0])?
1242                }
1243            }
1244            DataType::FixedSizeList(field, len) => {
1245                let child = &self.child_data[0];
1246                if !field.is_nullable() {
1247                    match &self.nulls {
1248                        Some(nulls) => {
1249                            let element_len = *len as usize;
1250                            let expanded = nulls.expand(element_len);
1251                            self.validate_non_nullable(Some(&expanded), child)?;
1252                        }
1253                        None => self.validate_non_nullable(None, child)?,
1254                    }
1255                }
1256            }
1257            DataType::Struct(fields) => {
1258                for (field, child) in fields.iter().zip(&self.child_data) {
1259                    if !field.is_nullable() {
1260                        self.validate_non_nullable(self.nulls(), child)?
1261                    }
1262                }
1263            }
1264            _ => {}
1265        }
1266
1267        Ok(())
1268    }
1269
1270    /// Verifies that `child` contains no nulls not present in `mask`
1271    fn validate_non_nullable(
1272        &self,
1273        mask: Option<&NullBuffer>,
1274        child: &ArrayData,
1275    ) -> Result<(), ArrowError> {
1276        let mask = match mask {
1277            Some(mask) => mask,
1278            None => {
1279                return match child.null_count() {
1280                    0 => Ok(()),
1281                    _ => Err(ArrowError::InvalidArgumentError(format!(
1282                        "non-nullable child of type {} contains nulls not present in parent {}",
1283                        child.data_type, self.data_type
1284                    ))),
1285                }
1286            }
1287        };
1288
1289        match child.nulls() {
1290            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1291                "non-nullable child of type {} contains nulls not present in parent",
1292                child.data_type
1293            ))),
1294            _ => Ok(()),
1295        }
1296    }
1297
1298    /// Validates the values stored within this [`ArrayData`] are valid
1299    /// without recursing into child [`ArrayData`]
1300    ///
1301    /// Does not (yet) check
1302    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1303    pub fn validate_values(&self) -> Result<(), ArrowError> {
1304        match &self.data_type {
1305            DataType::Utf8 => self.validate_utf8::<i32>(),
1306            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1307            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1308            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1309            DataType::BinaryView => {
1310                let views = self.typed_buffer::<u128>(0, self.len)?;
1311                validate_binary_view(views, &self.buffers[1..])
1312            }
1313            DataType::Utf8View => {
1314                let views = self.typed_buffer::<u128>(0, self.len)?;
1315                validate_string_view(views, &self.buffers[1..])
1316            }
1317            DataType::List(_) | DataType::Map(_, _) => {
1318                let child = &self.child_data[0];
1319                self.validate_offsets_full::<i32>(child.len)
1320            }
1321            DataType::LargeList(_) => {
1322                let child = &self.child_data[0];
1323                self.validate_offsets_full::<i64>(child.len)
1324            }
1325            DataType::Union(_, _) => {
1326                // Validate Union Array as part of implementing new Union semantics
1327                // See comments in `ArrayData::validate()`
1328                // https://github.com/apache/arrow-rs/issues/85
1329                //
1330                // TODO file follow on ticket for full union validation
1331                Ok(())
1332            }
1333            DataType::Dictionary(key_type, _value_type) => {
1334                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1335                let max_value = dictionary_length - 1;
1336                match key_type.as_ref() {
1337                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1338                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1339                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1340                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1341                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1342                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1343                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1344                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1345                    _ => unreachable!(),
1346                }
1347            }
1348            DataType::RunEndEncoded(run_ends, _values) => {
1349                let run_ends_data = self.child_data()[0].clone();
1350                match run_ends.data_type() {
1351                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1352                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1353                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1354                    _ => unreachable!(),
1355                }
1356            }
1357            _ => {
1358                // No extra validation check required for other types
1359                Ok(())
1360            }
1361        }
1362    }
1363
1364    /// Calls the `validate(item_index, range)` function for each of
1365    /// the ranges specified in the arrow offsets buffer of type
1366    /// `T`. Also validates that each offset is smaller than
1367    /// `offset_limit`
1368    ///
1369    /// For an empty array, the offsets buffer can either be empty
1370    /// or contain a single `0`.
1371    ///
1372    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1373    /// function would call `validate([1,2])`, and `validate([2,4])`
1374    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1375    where
1376        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1377        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1378    {
1379        self.typed_offsets::<T>()?
1380            .iter()
1381            .enumerate()
1382            .map(|(i, x)| {
1383                // check if the offset can be converted to usize
1384                let r = x.to_usize().ok_or_else(|| {
1385                    ArrowError::InvalidArgumentError(format!(
1386                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1387                    );
1388                // check if the offset exceeds the limit
1389                match r {
1390                    Ok(n) if n <= offset_limit => Ok((i, n)),
1391                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1392                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1393                    ),
1394                    Err(e) => Err(e),
1395                }
1396            })
1397            .scan(0_usize, |start, end| {
1398                // check offsets are monotonically increasing
1399                match end {
1400                    Ok((i, end)) if *start <= end => {
1401                        let range = Some(Ok((i, *start..end)));
1402                        *start = end;
1403                        range
1404                    }
1405                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1406                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1407                        i - 1, start, end))
1408                    )),
1409                    Err(err) => Some(Err(err)),
1410                }
1411            })
1412            .skip(1) // the first element is meaningless
1413            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1414                let (item_index, range) = res?;
1415                validate(item_index-1, range)
1416            })
1417    }
1418
1419    /// Ensures that all strings formed by the offsets in `buffers[0]`
1420    /// into `buffers[1]` are valid utf8 sequences
1421    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1422    where
1423        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1424    {
1425        let values_buffer = &self.buffers[1].as_slice();
1426        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1427            // Validate Offsets are correct
1428            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1429                if !values_str.is_char_boundary(range.start)
1430                    || !values_str.is_char_boundary(range.end)
1431                {
1432                    return Err(ArrowError::InvalidArgumentError(format!(
1433                        "incomplete utf-8 byte sequence from index {string_index}"
1434                    )));
1435                }
1436                Ok(())
1437            })
1438        } else {
1439            // find specific offset that failed utf8 validation
1440            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1441                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1442                    ArrowError::InvalidArgumentError(format!(
1443                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1444                    ))
1445                })?;
1446                Ok(())
1447            })
1448        }
1449    }
1450
1451    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1452    /// between `0` and `offset_limit`
1453    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1454    where
1455        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1456    {
1457        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1458            // No validation applied to each value, but the iteration
1459            // itself applies bounds checking to each range
1460            Ok(())
1461        })
1462    }
1463
1464    /// Validates that each value in self.buffers (typed as T)
1465    /// is within the range [0, max_value], inclusive
1466    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1467    where
1468        T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1469    {
1470        let required_len = self.len + self.offset;
1471        let buffer = &self.buffers[0];
1472
1473        // This should have been checked as part of `validate()` prior
1474        // to calling `validate_full()` but double check to be sure
1475        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1476
1477        // Justification: buffer size was validated above
1478        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1479
1480        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1481            // Do not check the value is null (value can be arbitrary)
1482            if self.is_null(i) {
1483                return Ok(());
1484            }
1485            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1486                ArrowError::InvalidArgumentError(format!(
1487                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1488                ))
1489            })?;
1490
1491            if dict_index < 0 || dict_index > max_value {
1492                return Err(ArrowError::InvalidArgumentError(format!(
1493                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1494                )));
1495            }
1496            Ok(())
1497        })
1498    }
1499
1500    /// Validates that each value in run_ends array is positive and strictly increasing.
1501    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1502    where
1503        T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1504    {
1505        let values = self.typed_buffer::<T>(0, self.len)?;
1506        let mut prev_value: i64 = 0_i64;
1507        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1508            let value: i64 = inp_value.try_into().map_err(|_| {
1509                ArrowError::InvalidArgumentError(format!(
1510                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1511                ))
1512            })?;
1513            if value <= 0_i64 {
1514                return Err(ArrowError::InvalidArgumentError(format!(
1515                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1516                )));
1517            }
1518            if ix > 0 && value <= prev_value {
1519                return Err(ArrowError::InvalidArgumentError(format!(
1520                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1521                )));
1522            }
1523
1524            prev_value = value;
1525            Ok(())
1526        })?;
1527
1528        if prev_value.as_usize() < (self.offset + self.len) {
1529            return Err(ArrowError::InvalidArgumentError(format!(
1530                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1531                self.offset + self.len
1532            )));
1533        }
1534        Ok(())
1535    }
1536
1537    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1538    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1539    /// return false when the arrays are logically equal
1540    pub fn ptr_eq(&self, other: &Self) -> bool {
1541        if self.offset != other.offset
1542            || self.len != other.len
1543            || self.data_type != other.data_type
1544            || self.buffers.len() != other.buffers.len()
1545            || self.child_data.len() != other.child_data.len()
1546        {
1547            return false;
1548        }
1549
1550        match (&self.nulls, &other.nulls) {
1551            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1552            (Some(_), None) | (None, Some(_)) => return false,
1553            _ => {}
1554        };
1555
1556        if !self
1557            .buffers
1558            .iter()
1559            .zip(other.buffers.iter())
1560            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1561        {
1562            return false;
1563        }
1564
1565        self.child_data
1566            .iter()
1567            .zip(other.child_data.iter())
1568            .all(|(a, b)| a.ptr_eq(b))
1569    }
1570
1571    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1572    pub fn into_builder(self) -> ArrayDataBuilder {
1573        self.into()
1574    }
1575}
1576
1577/// Return the expected [`DataTypeLayout`] Arrays of this data
1578/// type are expected to have
1579pub fn layout(data_type: &DataType) -> DataTypeLayout {
1580    // based on C/C++ implementation in
1581    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1582    use arrow_schema::IntervalUnit::*;
1583
1584    match data_type {
1585        DataType::Null => DataTypeLayout {
1586            buffers: vec![],
1587            can_contain_null_mask: false,
1588            variadic: false,
1589        },
1590        DataType::Boolean => DataTypeLayout {
1591            buffers: vec![BufferSpec::BitMap],
1592            can_contain_null_mask: true,
1593            variadic: false,
1594        },
1595        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1596        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1597        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1598        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1599        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1600        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1601        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1602        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1603        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1604        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1605        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1606        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1607        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1608        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1609        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1610        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1611        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1612        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1613        DataType::Interval(MonthDayNano) => {
1614            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1615        }
1616        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1617        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1618        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1619        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1620        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1621        DataType::FixedSizeBinary(size) => {
1622            let spec = BufferSpec::FixedWidth {
1623                byte_width: (*size).try_into().unwrap(),
1624                alignment: mem::align_of::<u8>(),
1625            };
1626            DataTypeLayout {
1627                buffers: vec![spec],
1628                can_contain_null_mask: true,
1629                variadic: false,
1630            }
1631        }
1632        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1633        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1634        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1635        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1636        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1637        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1638        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1639        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1640        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1641        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1642        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1643        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1644        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1645        DataType::Union(_, mode) => {
1646            let type_ids = BufferSpec::FixedWidth {
1647                byte_width: mem::size_of::<i8>(),
1648                alignment: mem::align_of::<i8>(),
1649            };
1650
1651            DataTypeLayout {
1652                buffers: match mode {
1653                    UnionMode::Sparse => {
1654                        vec![type_ids]
1655                    }
1656                    UnionMode::Dense => {
1657                        vec![
1658                            type_ids,
1659                            BufferSpec::FixedWidth {
1660                                byte_width: mem::size_of::<i32>(),
1661                                alignment: mem::align_of::<i32>(),
1662                            },
1663                        ]
1664                    }
1665                },
1666                can_contain_null_mask: false,
1667                variadic: false,
1668            }
1669        }
1670        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1671    }
1672}
1673
1674/// Layout specification for a data type
1675#[derive(Debug, PartialEq, Eq)]
1676// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
1677pub struct DataTypeLayout {
1678    /// A vector of buffer layout specifications, one for each expected buffer
1679    pub buffers: Vec<BufferSpec>,
1680
1681    /// Can contain a null bitmask
1682    pub can_contain_null_mask: bool,
1683
1684    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
1685    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
1686    /// buffers.len(). Buffers that exceed the lower bound are legal.
1687    pub variadic: bool,
1688}
1689
1690impl DataTypeLayout {
1691    /// Describes a basic numeric array where each element has type `T`
1692    pub fn new_fixed_width<T>() -> Self {
1693        Self {
1694            buffers: vec![BufferSpec::FixedWidth {
1695                byte_width: mem::size_of::<T>(),
1696                alignment: mem::align_of::<T>(),
1697            }],
1698            can_contain_null_mask: true,
1699            variadic: false,
1700        }
1701    }
1702
1703    /// Describes arrays which have no data of their own
1704    /// but may still have a Null Bitmap (e.g. FixedSizeList)
1705    pub fn new_nullable_empty() -> Self {
1706        Self {
1707            buffers: vec![],
1708            can_contain_null_mask: true,
1709            variadic: false,
1710        }
1711    }
1712
1713    /// Describes arrays which have no data of their own
1714    /// (e.g. RunEndEncoded).
1715    pub fn new_empty() -> Self {
1716        Self {
1717            buffers: vec![],
1718            can_contain_null_mask: false,
1719            variadic: false,
1720        }
1721    }
1722
1723    /// Describes a basic numeric array where each element has a fixed
1724    /// with offset buffer of type `T`, followed by a
1725    /// variable width data buffer
1726    pub fn new_binary<T>() -> Self {
1727        Self {
1728            buffers: vec![
1729                // offsets
1730                BufferSpec::FixedWidth {
1731                    byte_width: mem::size_of::<T>(),
1732                    alignment: mem::align_of::<T>(),
1733                },
1734                // values
1735                BufferSpec::VariableWidth,
1736            ],
1737            can_contain_null_mask: true,
1738            variadic: false,
1739        }
1740    }
1741
1742    /// Describes a view type
1743    pub fn new_view() -> Self {
1744        Self {
1745            buffers: vec![BufferSpec::FixedWidth {
1746                byte_width: mem::size_of::<u128>(),
1747                alignment: mem::align_of::<u128>(),
1748            }],
1749            can_contain_null_mask: true,
1750            variadic: true,
1751        }
1752    }
1753
1754    /// Describes a list view type
1755    pub fn new_list_view<T>() -> Self {
1756        Self {
1757            buffers: vec![
1758                BufferSpec::FixedWidth {
1759                    byte_width: mem::size_of::<T>(),
1760                    alignment: mem::align_of::<T>(),
1761                },
1762                BufferSpec::FixedWidth {
1763                    byte_width: mem::size_of::<T>(),
1764                    alignment: mem::align_of::<T>(),
1765                },
1766            ],
1767            can_contain_null_mask: true,
1768            variadic: true,
1769        }
1770    }
1771}
1772
1773/// Layout specification for a single data type buffer
1774#[derive(Debug, PartialEq, Eq)]
1775pub enum BufferSpec {
1776    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
1777    ///
1778    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
1779    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
1780    ///
1781    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
1782    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
1783    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
1784    ///
1785    /// Note that these alignment requirements will vary between architectures
1786    FixedWidth {
1787        /// The width of each element in bytes
1788        byte_width: usize,
1789        /// The alignment required by Rust for an array of the corresponding primitive
1790        alignment: usize,
1791    },
1792    /// Variable width, such as string data for utf8 data
1793    VariableWidth,
1794    /// Buffer holds a bitmap.
1795    ///
1796    /// Note: Unlike the C++ implementation, the null/validity buffer
1797    /// is handled specially rather than as another of the buffers in
1798    /// the spec, so this variant is only used for the Boolean type.
1799    BitMap,
1800    /// Buffer is always null. Unused currently in Rust implementation,
1801    /// (used in C++ for Union type)
1802    #[allow(dead_code)]
1803    AlwaysNull,
1804}
1805
1806impl PartialEq for ArrayData {
1807    fn eq(&self, other: &Self) -> bool {
1808        equal::equal(self, other)
1809    }
1810}
1811
1812/// A boolean flag that cannot be mutated outside of unsafe code.
1813///
1814/// Defaults to a value of false.
1815///
1816/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
1817///
1818/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
1819///
1820/// # Example
1821/// ```rust
1822/// use arrow_data::UnsafeFlag;
1823/// assert!(!UnsafeFlag::default().get()); // default is false
1824/// let mut flag = UnsafeFlag::new();
1825/// assert!(!flag.get()); // defaults to false
1826/// // can only set it to true in unsafe code
1827/// unsafe { flag.set(true) };
1828/// assert!(flag.get()); // now true
1829/// ```
1830#[derive(Debug, Clone)]
1831#[doc(hidden)]
1832pub struct UnsafeFlag(bool);
1833
1834impl UnsafeFlag {
1835    /// Creates a new `UnsafeFlag` with the value set to `false`.
1836    ///
1837    /// See examples on [`Self::new`]
1838    #[inline]
1839    pub const fn new() -> Self {
1840        Self(false)
1841    }
1842
1843    /// Sets the value of the flag to the given value
1844    ///
1845    /// Note this can purposely only be done in `unsafe` code
1846    ///
1847    /// # Safety
1848    ///
1849    /// If set, the flag will be set to the given value. There is nothing
1850    /// immediately unsafe about doing so, however, the flag can be used to
1851    /// subsequently bypass safety checks in the [`ArrayDataBuilder`].
1852    #[inline]
1853    pub unsafe fn set(&mut self, val: bool) {
1854        self.0 = val;
1855    }
1856
1857    /// Returns the value of the flag
1858    #[inline]
1859    pub fn get(&self) -> bool {
1860        self.0
1861    }
1862}
1863
1864// Manual impl to make it clear you can not construct unsafe with true
1865impl Default for UnsafeFlag {
1866    fn default() -> Self {
1867        Self::new()
1868    }
1869}
1870
1871/// Builder for [`ArrayData`] type
1872#[derive(Debug)]
1873pub struct ArrayDataBuilder {
1874    data_type: DataType,
1875    len: usize,
1876    null_count: Option<usize>,
1877    null_bit_buffer: Option<Buffer>,
1878    nulls: Option<NullBuffer>,
1879    offset: usize,
1880    buffers: Vec<Buffer>,
1881    child_data: Vec<ArrayData>,
1882    /// Should buffers be realigned (copying if necessary)?
1883    ///
1884    /// Defaults to false.
1885    align_buffers: bool,
1886    /// Should data validation be skipped for this [`ArrayData`]?
1887    ///
1888    /// Defaults to false.
1889    ///
1890    /// # Safety
1891    ///
1892    /// This flag can only be set to true using `unsafe` APIs. However, once true
1893    /// subsequent calls to `build()` may result in undefined behavior if the data
1894    /// is not valid.
1895    skip_validation: UnsafeFlag,
1896}
1897
1898impl ArrayDataBuilder {
1899    #[inline]
1900    /// Creates a new array data builder
1901    pub const fn new(data_type: DataType) -> Self {
1902        Self {
1903            data_type,
1904            len: 0,
1905            null_count: None,
1906            null_bit_buffer: None,
1907            nulls: None,
1908            offset: 0,
1909            buffers: vec![],
1910            child_data: vec![],
1911            align_buffers: false,
1912            skip_validation: UnsafeFlag::new(),
1913        }
1914    }
1915
1916    /// Creates a new array data builder from an existing one, changing the data type
1917    pub fn data_type(self, data_type: DataType) -> Self {
1918        Self { data_type, ..self }
1919    }
1920
1921    #[inline]
1922    #[allow(clippy::len_without_is_empty)]
1923    /// Sets the length of the [ArrayData]
1924    pub const fn len(mut self, n: usize) -> Self {
1925        self.len = n;
1926        self
1927    }
1928
1929    /// Sets the null buffer of the [ArrayData]
1930    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1931        self.nulls = nulls;
1932        self.null_count = None;
1933        self.null_bit_buffer = None;
1934        self
1935    }
1936
1937    /// Sets the null count of the [ArrayData]
1938    pub fn null_count(mut self, null_count: usize) -> Self {
1939        self.null_count = Some(null_count);
1940        self
1941    }
1942
1943    /// Sets the `null_bit_buffer` of the [ArrayData]
1944    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1945        self.nulls = None;
1946        self.null_bit_buffer = buf;
1947        self
1948    }
1949
1950    /// Sets the offset of the [ArrayData]
1951    #[inline]
1952    pub const fn offset(mut self, n: usize) -> Self {
1953        self.offset = n;
1954        self
1955    }
1956
1957    /// Sets the buffers of the [ArrayData]
1958    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1959        self.buffers = v;
1960        self
1961    }
1962
1963    /// Adds a single buffer to the [ArrayData]'s buffers
1964    pub fn add_buffer(mut self, b: Buffer) -> Self {
1965        self.buffers.push(b);
1966        self
1967    }
1968
1969    /// Adds multiple buffers to the [ArrayData]'s buffers
1970    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1971        self.buffers.extend(bs);
1972        self
1973    }
1974
1975    /// Sets the child data of the [ArrayData]
1976    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1977        self.child_data = v;
1978        self
1979    }
1980
1981    /// Adds a single child data to the [ArrayData]'s child data
1982    pub fn add_child_data(mut self, r: ArrayData) -> Self {
1983        self.child_data.push(r);
1984        self
1985    }
1986
1987    /// Creates an array data, without any validation
1988    ///
1989    /// Note: This is shorthand for
1990    /// ```rust
1991    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
1992    /// # let _ = unsafe {
1993    /// builder.skip_validation(true).build().unwrap()
1994    /// # };
1995    /// ```
1996    ///
1997    /// # Safety
1998    ///
1999    /// The same caveats as [`ArrayData::new_unchecked`]
2000    /// apply.
2001    pub unsafe fn build_unchecked(self) -> ArrayData {
2002        self.skip_validation(true).build().unwrap()
2003    }
2004
2005    /// Creates an `ArrayData`, consuming `self`
2006    ///
2007    /// # Safety
2008    ///
2009    /// By default the underlying buffers are checked to ensure they are valid
2010    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
2011    /// to true (by the `unsafe` API) this validation is skipped. If the data is
2012    /// not valid, undefined behavior will result.
2013    pub fn build(self) -> Result<ArrayData, ArrowError> {
2014        let Self {
2015            data_type,
2016            len,
2017            null_count,
2018            null_bit_buffer,
2019            nulls,
2020            offset,
2021            buffers,
2022            child_data,
2023            align_buffers,
2024            skip_validation,
2025        } = self;
2026
2027        let nulls = nulls
2028            .or_else(|| {
2029                let buffer = null_bit_buffer?;
2030                let buffer = BooleanBuffer::new(buffer, offset, len);
2031                Some(match null_count {
2032                    Some(n) => {
2033                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2034                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2035                    }
2036                    None => NullBuffer::new(buffer),
2037                })
2038            })
2039            .filter(|b| b.null_count() != 0);
2040
2041        let mut data = ArrayData {
2042            data_type,
2043            len,
2044            offset,
2045            buffers,
2046            child_data,
2047            nulls,
2048        };
2049
2050        if align_buffers {
2051            data.align_buffers();
2052        }
2053
2054        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2055        if !skip_validation.get() || cfg!(feature = "force_validate") {
2056            data.validate_data()?;
2057        }
2058        Ok(data)
2059    }
2060
2061    /// Creates an array data, validating all inputs, and aligning any buffers
2062    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2063    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2064        self.align_buffers(true).build()
2065    }
2066
2067    /// Ensure that all buffers are aligned, copying data if necessary
2068    ///
2069    /// Rust requires that arrays are aligned to their corresponding primitive,
2070    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2071    ///
2072    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2073    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2074    ///
2075    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2076    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2077    /// when necessary, making it useful when interacting with buffers produced by other systems,
2078    /// e.g. IPC or FFI.
2079    ///
2080    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2081    /// insufficiently aligned buffers.
2082    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2083        self.align_buffers = align_buffers;
2084        self
2085    }
2086
2087    /// Skips validation of the data.
2088    ///
2089    /// If this flag is enabled, `[Self::build`] will skip validation of the
2090    /// data
2091    ///
2092    /// If this flag is not enabled, `[Self::build`] will validate that all
2093    /// buffers are valid and will return an error if any data is invalid.
2094    /// Validation can be expensive.
2095    ///
2096    /// # Safety
2097    ///
2098    /// If validation is skipped, the buffers must form a valid Arrow array,
2099    /// otherwise undefined behavior will result
2100    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2101        self.skip_validation.set(skip_validation);
2102        self
2103    }
2104}
2105
2106impl From<ArrayData> for ArrayDataBuilder {
2107    fn from(d: ArrayData) -> Self {
2108        Self {
2109            data_type: d.data_type,
2110            len: d.len,
2111            offset: d.offset,
2112            buffers: d.buffers,
2113            child_data: d.child_data,
2114            nulls: d.nulls,
2115            null_bit_buffer: None,
2116            null_count: None,
2117            align_buffers: false,
2118            skip_validation: UnsafeFlag::new(),
2119        }
2120    }
2121}
2122
2123#[cfg(test)]
2124mod tests {
2125    use super::*;
2126    use arrow_schema::{Field, Fields};
2127
2128    // See arrow/tests/array_data_validation.rs for test of array validation
2129
2130    /// returns a buffer initialized with some constant value for tests
2131    fn make_i32_buffer(n: usize) -> Buffer {
2132        Buffer::from_slice_ref(vec![42i32; n])
2133    }
2134
2135    /// returns a buffer initialized with some constant value for tests
2136    fn make_f32_buffer(n: usize) -> Buffer {
2137        Buffer::from_slice_ref(vec![42f32; n])
2138    }
2139
2140    #[test]
2141    fn test_builder() {
2142        // Buffer needs to be at least 25 long
2143        let v = (0..25).collect::<Vec<i32>>();
2144        let b1 = Buffer::from_slice_ref(&v);
2145        let arr_data = ArrayData::builder(DataType::Int32)
2146            .len(20)
2147            .offset(5)
2148            .add_buffer(b1)
2149            .null_bit_buffer(Some(Buffer::from([
2150                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2151            ])))
2152            .build()
2153            .unwrap();
2154
2155        assert_eq!(20, arr_data.len());
2156        assert_eq!(10, arr_data.null_count());
2157        assert_eq!(5, arr_data.offset());
2158        assert_eq!(1, arr_data.buffers().len());
2159        assert_eq!(
2160            Buffer::from_slice_ref(&v).as_slice(),
2161            arr_data.buffers()[0].as_slice()
2162        );
2163    }
2164
2165    #[test]
2166    fn test_builder_with_child_data() {
2167        let child_arr_data = ArrayData::try_new(
2168            DataType::Int32,
2169            5,
2170            None,
2171            0,
2172            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2173            vec![],
2174        )
2175        .unwrap();
2176
2177        let field = Arc::new(Field::new("x", DataType::Int32, true));
2178        let data_type = DataType::Struct(vec![field].into());
2179
2180        let arr_data = ArrayData::builder(data_type)
2181            .len(5)
2182            .offset(0)
2183            .add_child_data(child_arr_data.clone())
2184            .build()
2185            .unwrap();
2186
2187        assert_eq!(5, arr_data.len());
2188        assert_eq!(1, arr_data.child_data().len());
2189        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2190    }
2191
2192    #[test]
2193    fn test_null_count() {
2194        let mut bit_v: [u8; 2] = [0; 2];
2195        bit_util::set_bit(&mut bit_v, 0);
2196        bit_util::set_bit(&mut bit_v, 3);
2197        bit_util::set_bit(&mut bit_v, 10);
2198        let arr_data = ArrayData::builder(DataType::Int32)
2199            .len(16)
2200            .add_buffer(make_i32_buffer(16))
2201            .null_bit_buffer(Some(Buffer::from(bit_v)))
2202            .build()
2203            .unwrap();
2204        assert_eq!(13, arr_data.null_count());
2205
2206        // Test with offset
2207        let mut bit_v: [u8; 2] = [0; 2];
2208        bit_util::set_bit(&mut bit_v, 0);
2209        bit_util::set_bit(&mut bit_v, 3);
2210        bit_util::set_bit(&mut bit_v, 10);
2211        let arr_data = ArrayData::builder(DataType::Int32)
2212            .len(12)
2213            .offset(2)
2214            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2215            .null_bit_buffer(Some(Buffer::from(bit_v)))
2216            .build()
2217            .unwrap();
2218        assert_eq!(10, arr_data.null_count());
2219    }
2220
2221    #[test]
2222    fn test_null_buffer_ref() {
2223        let mut bit_v: [u8; 2] = [0; 2];
2224        bit_util::set_bit(&mut bit_v, 0);
2225        bit_util::set_bit(&mut bit_v, 3);
2226        bit_util::set_bit(&mut bit_v, 10);
2227        let arr_data = ArrayData::builder(DataType::Int32)
2228            .len(16)
2229            .add_buffer(make_i32_buffer(16))
2230            .null_bit_buffer(Some(Buffer::from(bit_v)))
2231            .build()
2232            .unwrap();
2233        assert!(arr_data.nulls().is_some());
2234        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2235    }
2236
2237    #[test]
2238    fn test_slice() {
2239        let mut bit_v: [u8; 2] = [0; 2];
2240        bit_util::set_bit(&mut bit_v, 0);
2241        bit_util::set_bit(&mut bit_v, 3);
2242        bit_util::set_bit(&mut bit_v, 10);
2243        let data = ArrayData::builder(DataType::Int32)
2244            .len(16)
2245            .add_buffer(make_i32_buffer(16))
2246            .null_bit_buffer(Some(Buffer::from(bit_v)))
2247            .build()
2248            .unwrap();
2249        let new_data = data.slice(1, 15);
2250        assert_eq!(data.len() - 1, new_data.len());
2251        assert_eq!(1, new_data.offset());
2252        assert_eq!(data.null_count(), new_data.null_count());
2253
2254        // slice of a slice (removes one null)
2255        let new_data = new_data.slice(1, 14);
2256        assert_eq!(data.len() - 2, new_data.len());
2257        assert_eq!(2, new_data.offset());
2258        assert_eq!(data.null_count() - 1, new_data.null_count());
2259    }
2260
2261    #[test]
2262    fn test_equality() {
2263        let int_data = ArrayData::builder(DataType::Int32)
2264            .len(1)
2265            .add_buffer(make_i32_buffer(1))
2266            .build()
2267            .unwrap();
2268
2269        let float_data = ArrayData::builder(DataType::Float32)
2270            .len(1)
2271            .add_buffer(make_f32_buffer(1))
2272            .build()
2273            .unwrap();
2274        assert_ne!(int_data, float_data);
2275        assert!(!int_data.ptr_eq(&float_data));
2276        assert!(int_data.ptr_eq(&int_data));
2277
2278        #[allow(clippy::redundant_clone)]
2279        let int_data_clone = int_data.clone();
2280        assert_eq!(int_data, int_data_clone);
2281        assert!(int_data.ptr_eq(&int_data_clone));
2282        assert!(int_data_clone.ptr_eq(&int_data));
2283
2284        let int_data_slice = int_data_clone.slice(1, 0);
2285        assert!(int_data_slice.ptr_eq(&int_data_slice));
2286        assert!(!int_data.ptr_eq(&int_data_slice));
2287        assert!(!int_data_slice.ptr_eq(&int_data));
2288
2289        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2290        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2291        let string_data = ArrayData::try_new(
2292            DataType::Utf8,
2293            3,
2294            Some(Buffer::from_iter(vec![true, false, true])),
2295            0,
2296            vec![offsets_buffer, data_buffer],
2297            vec![],
2298        )
2299        .unwrap();
2300
2301        assert_ne!(float_data, string_data);
2302        assert!(!float_data.ptr_eq(&string_data));
2303
2304        assert!(string_data.ptr_eq(&string_data));
2305
2306        #[allow(clippy::redundant_clone)]
2307        let string_data_cloned = string_data.clone();
2308        assert!(string_data_cloned.ptr_eq(&string_data));
2309        assert!(string_data.ptr_eq(&string_data_cloned));
2310
2311        let string_data_slice = string_data.slice(1, 2);
2312        assert!(string_data_slice.ptr_eq(&string_data_slice));
2313        assert!(!string_data_slice.ptr_eq(&string_data))
2314    }
2315
2316    #[test]
2317    fn test_slice_memory_size() {
2318        let mut bit_v: [u8; 2] = [0; 2];
2319        bit_util::set_bit(&mut bit_v, 0);
2320        bit_util::set_bit(&mut bit_v, 3);
2321        bit_util::set_bit(&mut bit_v, 10);
2322        let data = ArrayData::builder(DataType::Int32)
2323            .len(16)
2324            .add_buffer(make_i32_buffer(16))
2325            .null_bit_buffer(Some(Buffer::from(bit_v)))
2326            .build()
2327            .unwrap();
2328        let new_data = data.slice(1, 14);
2329        assert_eq!(
2330            data.get_slice_memory_size().unwrap() - 8,
2331            new_data.get_slice_memory_size().unwrap()
2332        );
2333        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2334        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2335        let string_data = ArrayData::try_new(
2336            DataType::Utf8,
2337            3,
2338            Some(Buffer::from_iter(vec![true, false, true])),
2339            0,
2340            vec![offsets_buffer, data_buffer],
2341            vec![],
2342        )
2343        .unwrap();
2344        let string_data_slice = string_data.slice(1, 2);
2345        //4 bytes of offset and 2 bytes of data reduced by slicing.
2346        assert_eq!(
2347            string_data.get_slice_memory_size().unwrap() - 6,
2348            string_data_slice.get_slice_memory_size().unwrap()
2349        );
2350    }
2351
2352    #[test]
2353    fn test_count_nulls() {
2354        let buffer = Buffer::from([0b00010110, 0b10011111]);
2355        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2356        let count = count_nulls(Some(&buffer), 0, 16);
2357        assert_eq!(count, 7);
2358
2359        let count = count_nulls(Some(&buffer), 4, 8);
2360        assert_eq!(count, 3);
2361    }
2362
2363    #[test]
2364    fn test_contains_nulls() {
2365        let buffer: Buffer =
2366            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2367        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2368        assert!(contains_nulls(Some(&buffer), 0, 6));
2369        assert!(contains_nulls(Some(&buffer), 0, 3));
2370        assert!(!contains_nulls(Some(&buffer), 3, 2));
2371        assert!(!contains_nulls(Some(&buffer), 0, 0));
2372    }
2373
2374    #[test]
2375    fn test_alignment() {
2376        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2377        let sliced = buffer.slice(1);
2378
2379        let mut data = ArrayData {
2380            data_type: DataType::Int32,
2381            len: 0,
2382            offset: 0,
2383            buffers: vec![buffer],
2384            child_data: vec![],
2385            nulls: None,
2386        };
2387        data.validate_full().unwrap();
2388
2389        // break alignment in data
2390        data.buffers[0] = sliced;
2391        let err = data.validate().unwrap_err();
2392
2393        assert_eq!(
2394            err.to_string(),
2395            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2396        );
2397
2398        data.align_buffers();
2399        data.validate_full().unwrap();
2400    }
2401
2402    #[test]
2403    fn test_alignment_struct() {
2404        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2405        let sliced = buffer.slice(1);
2406
2407        let child_data = ArrayData {
2408            data_type: DataType::Int32,
2409            len: 0,
2410            offset: 0,
2411            buffers: vec![buffer],
2412            child_data: vec![],
2413            nulls: None,
2414        };
2415
2416        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2417        let mut data = ArrayData {
2418            data_type: schema,
2419            len: 0,
2420            offset: 0,
2421            buffers: vec![],
2422            child_data: vec![child_data],
2423            nulls: None,
2424        };
2425        data.validate_full().unwrap();
2426
2427        // break alignment in child data
2428        data.child_data[0].buffers[0] = sliced;
2429        let err = data.validate().unwrap_err();
2430
2431        assert_eq!(
2432            err.to_string(),
2433            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2434        );
2435
2436        data.align_buffers();
2437        data.validate_full().unwrap();
2438    }
2439
2440    #[test]
2441    fn test_null_view_types() {
2442        let array_len = 32;
2443        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2444        assert_eq!(array.len(), array_len);
2445        for i in 0..array.len() {
2446            assert!(array.is_null(i));
2447        }
2448
2449        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2450        assert_eq!(array.len(), array_len);
2451        for i in 0..array.len() {
2452            assert!(array.is_null(i));
2453        }
2454    }
2455}
arrow_data/data.rs

arrow_data/
data.rs