arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
19//! common attributes and operations for Arrow array.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35    null_bit_buffer: Option<&NullBuffer>,
36    offset: usize,
37    len: usize,
38) -> bool {
39    match null_bit_buffer {
40        Some(buffer) => {
41            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42                Some((start, end)) => start != 0 || end != len,
43                None => len != 0, // No non-null values
44            }
45        }
46        None => false, // No null buffer
47    }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52    null_bit_buffer: Option<&NullBuffer>,
53    offset: usize,
54    len: usize,
55) -> usize {
56    if let Some(buf) = null_bit_buffer {
57        let buffer = buf.buffer();
58        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59    } else {
60        0
61    }
62}
63
64/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67    let empty_buffer = MutableBuffer::new(0);
68    match data_type {
69        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70        DataType::Boolean => {
71            let bytes = bit_util::ceil(capacity, 8);
72            let buffer = MutableBuffer::new(bytes);
73            [buffer, empty_buffer]
74        }
75        DataType::UInt8
76        | DataType::UInt16
77        | DataType::UInt32
78        | DataType::UInt64
79        | DataType::Int8
80        | DataType::Int16
81        | DataType::Int32
82        | DataType::Int64
83        | DataType::Float16
84        | DataType::Float32
85        | DataType::Float64
86        | DataType::Decimal128(_, _)
87        | DataType::Decimal256(_, _)
88        | DataType::Date32
89        | DataType::Time32(_)
90        | DataType::Date64
91        | DataType::Time64(_)
92        | DataType::Duration(_)
93        | DataType::Timestamp(_, _)
94        | DataType::Interval(_) => [
95            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
96            empty_buffer,
97        ],
98        DataType::Utf8 | DataType::Binary => {
99            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
100            // safety: `unsafe` code assumes that this buffer is initialized with one element
101            buffer.push(0i32);
102            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
103        }
104        DataType::LargeUtf8 | DataType::LargeBinary => {
105            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
106            // safety: `unsafe` code assumes that this buffer is initialized with one element
107            buffer.push(0i64);
108            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
109        }
110        DataType::BinaryView | DataType::Utf8View => [
111            MutableBuffer::new(capacity * mem::size_of::<u128>()),
112            empty_buffer,
113        ],
114        DataType::List(_) | DataType::Map(_, _) => {
115            // offset buffer always starts with a zero
116            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
117            buffer.push(0i32);
118            [buffer, empty_buffer]
119        }
120        DataType::ListView(_) => [
121            MutableBuffer::new(capacity * mem::size_of::<i32>()),
122            MutableBuffer::new(capacity * mem::size_of::<i32>()),
123        ],
124        DataType::LargeList(_) => {
125            // offset buffer always starts with a zero
126            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
127            buffer.push(0i64);
128            [buffer, empty_buffer]
129        }
130        DataType::LargeListView(_) => [
131            MutableBuffer::new(capacity * mem::size_of::<i64>()),
132            MutableBuffer::new(capacity * mem::size_of::<i64>()),
133        ],
134        DataType::FixedSizeBinary(size) => {
135            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
136        }
137        DataType::Dictionary(k, _) => [
138            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
139            empty_buffer,
140        ],
141        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
142            [empty_buffer, MutableBuffer::new(0)]
143        }
144        DataType::Union(_, mode) => {
145            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
146            match mode {
147                UnionMode::Sparse => [type_ids, empty_buffer],
148                UnionMode::Dense => {
149                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
150                    [type_ids, offsets]
151                }
152            }
153        }
154    }
155}
156
/// A generic representation of Arrow array data which encapsulates common attributes
/// and operations for Arrow array.
///
/// Specific operations for different arrays types (e.g., primitive, list, struct)
/// are implemented in `Array`.
///
/// # Memory Layout
///
/// `ArrayData` has references to one or more underlying data buffers
/// and optional child ArrayData, depending on type as illustrated
/// below. Bitmaps are not shown for simplicity but they are stored
/// similarly to the buffers.
///
/// ```text
///                        offset
///                       points to
/// ┌───────────────────┐ start of  ┌───────┐       Different
/// │                   │   data    │       │     ArrayData may
/// │ArrayData {        │           │....   │     also refer to
/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
/// │  buffers: [       │           │5882   │◀─
/// │    ...            │  │        │4323   │
/// │  ]                │   ─ ─ ─ ─▶│4859   │
/// │  child_data: [    │           │....   │
/// │    ...            │           │       │
/// │  ]                │           └───────┘
/// │}                  │
/// │                   │            Shared Buffer uses
/// │               │   │            bytes::Bytes to hold
/// └───────────────────┘            actual data values
///           ┌ ─ ─ ┘
///
///           ▼
/// ┌───────────────────┐
/// │ArrayData {        │
/// │  ...              │
/// │}                  │
/// │                   │
/// └───────────────────┘
///
/// Child ArrayData may also have its own buffers and children
/// ```

#[derive(Debug, Clone)]
pub struct ArrayData {
    /// The data type for this array data
    data_type: DataType,

    /// The number of elements in this array data
    len: usize,

    /// The offset into this array data, in number of items
    offset: usize,

    /// The buffers for this array data. Note that depending on the array types, this
    /// could hold different kinds of buffers (e.g., value buffer, value offset buffer)
    /// at different positions.
    buffers: Vec<Buffer>,

    /// The child(ren) of this array. Only non-empty for nested types, e.g. list,
    /// fixed-size list, map, struct, dictionary, union and run-end encoded arrays.
    child_data: Vec<ArrayData>,

    /// The null bitmap. A `None` value for this indicates all values are non-null in
    /// this array.
    nulls: Option<NullBuffer>,
}
226
/// A thread-safe, shared reference to the Arrow array data: an [`ArrayData`]
/// wrapped in an [`Arc`].
pub type ArrayDataRef = Arc<ArrayData>;
229
230impl ArrayData {
231    /// Create a new ArrayData instance;
232    ///
233    /// If `null_count` is not specified, the number of nulls in
234    /// null_bit_buffer is calculated.
235    ///
236    /// If the number of nulls is 0 then the null_bit_buffer
237    /// is set to `None`.
238    ///
239    /// # Safety
240    ///
241    /// The input values *must* form a valid Arrow array for
242    /// `data_type`, or undefined behavior can result.
243    ///
244    /// Note: This is a low level API and most users of the arrow
245    /// crate should create arrays using the methods in the `array`
246    /// module.
247    pub unsafe fn new_unchecked(
248        data_type: DataType,
249        len: usize,
250        null_count: Option<usize>,
251        null_bit_buffer: Option<Buffer>,
252        offset: usize,
253        buffers: Vec<Buffer>,
254        child_data: Vec<ArrayData>,
255    ) -> Self {
256        let mut skip_validation = UnsafeFlag::new();
257        // SAFETY: caller responsible for ensuring data is valid
258        skip_validation.set(true);
259
260        ArrayDataBuilder {
261            data_type,
262            len,
263            null_count,
264            null_bit_buffer,
265            nulls: None,
266            offset,
267            buffers,
268            child_data,
269            align_buffers: false,
270            skip_validation,
271        }
272        .build()
273        .unwrap()
274    }
275
276    /// Create a new ArrayData, validating that the provided buffers form a valid
277    /// Arrow array of the specified data type.
278    ///
279    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
280    /// is set to `None`.
281    ///
282    /// Internally this calls through to [`Self::validate_data`]
283    ///
284    /// Note: This is a low level API and most users of the arrow crate should create
285    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
286    pub fn try_new(
287        data_type: DataType,
288        len: usize,
289        null_bit_buffer: Option<Buffer>,
290        offset: usize,
291        buffers: Vec<Buffer>,
292        child_data: Vec<ArrayData>,
293    ) -> Result<Self, ArrowError> {
294        // we must check the length of `null_bit_buffer` first
295        // because we use this buffer to calculate `null_count`
296        // in `Self::new_unchecked`.
297        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
298            let needed_len = bit_util::ceil(len + offset, 8);
299            if null_bit_buffer.len() < needed_len {
300                return Err(ArrowError::InvalidArgumentError(format!(
301                    "null_bit_buffer size too small. got {} needed {}",
302                    null_bit_buffer.len(),
303                    needed_len
304                )));
305            }
306        }
307        // Safety justification: `validate_full` is called below
308        let new_self = unsafe {
309            Self::new_unchecked(
310                data_type,
311                len,
312                None,
313                null_bit_buffer,
314                offset,
315                buffers,
316                child_data,
317            )
318        };
319
320        // As the data is not trusted, do a full validation of its contents
321        // We don't need to validate children as we can assume that the
322        // [`ArrayData`] in `child_data` have already been validated through
323        // a call to `ArrayData::try_new` or created using unsafe
324        new_self.validate_data()?;
325        Ok(new_self)
326    }
327
    /// Returns an [`ArrayDataBuilder`] for constructing an [`ArrayData`] of the
    /// given [`DataType`].
    ///
    /// Shorthand for [`ArrayDataBuilder::new`].
    #[inline]
    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
        ArrayDataBuilder::new(data_type)
    }
333
    /// Returns a reference to the [`DataType`] of this [`ArrayData`].
    ///
    /// The reference is borrowed from `self`; no clone is made.
    #[inline]
    pub const fn data_type(&self) -> &DataType {
        &self.data_type
    }
339
340    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
341    pub fn buffers(&self) -> &[Buffer] {
342        &self.buffers
343    }
344
345    /// Returns a slice of children [`ArrayData`]. This will be non
346    /// empty for type such as lists and structs.
347    pub fn child_data(&self) -> &[ArrayData] {
348        &self.child_data[..]
349    }
350
351    /// Returns whether the element at index `i` is null
352    #[inline]
353    pub fn is_null(&self, i: usize) -> bool {
354        match &self.nulls {
355            Some(v) => v.is_null(i),
356            None => false,
357        }
358    }
359
    /// Returns a reference to the null buffer of this [`ArrayData`], or `None` if
    /// it has none (i.e. all values are non-null).
    ///
    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
    #[inline]
    pub fn nulls(&self) -> Option<&NullBuffer> {
        self.nulls.as_ref()
    }
367
    /// Returns whether the element at index `i` is not null.
    ///
    /// This is the exact negation of [`Self::is_null`].
    #[inline]
    pub fn is_valid(&self, i: usize) -> bool {
        !self.is_null(i)
    }
373
    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
    ///
    /// This counts elements, not bytes.
    #[inline]
    pub const fn len(&self) -> usize {
        self.len
    }
379
    /// Returns whether this [`ArrayData`] is empty, i.e. its length is `0`.
    #[inline]
    pub const fn is_empty(&self) -> bool {
        self.len == 0
    }
385
    /// Returns the offset of this [`ArrayData`], in number of items.
    #[inline]
    pub const fn offset(&self) -> usize {
        self.offset
    }
391
392    /// Returns the total number of nulls in this array
393    #[inline]
394    pub fn null_count(&self) -> usize {
395        self.nulls
396            .as_ref()
397            .map(|x| x.null_count())
398            .unwrap_or_default()
399    }
400
401    /// Returns the total number of bytes of memory occupied by the
402    /// buffers owned by this [`ArrayData`] and all of its
403    /// children. (See also diagram on [`ArrayData`]).
404    ///
405    /// Note that this [`ArrayData`] may only refer to a subset of the
406    /// data in the underlying [`Buffer`]s (due to `offset` and
407    /// `length`), but the size returned includes the entire size of
408    /// the buffers.
409    ///
410    /// If multiple [`ArrayData`]s refer to the same underlying
411    /// [`Buffer`]s they will both report the same size.
412    pub fn get_buffer_memory_size(&self) -> usize {
413        let mut size = 0;
414        for buffer in &self.buffers {
415            size += buffer.capacity();
416        }
417        if let Some(bitmap) = &self.nulls {
418            size += bitmap.buffer().capacity()
419        }
420        for child in &self.child_data {
421            size += child.get_buffer_memory_size();
422        }
423        size
424    }
425
426    /// Returns the total number of the bytes of memory occupied by
427    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
428    ///
429    /// This is approximately the number of bytes if a new
430    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
431    /// exactly the data needed.
432    ///
433    /// For example, a [`DataType::Int64`] with `100` elements,
434    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
435    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
436    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
437    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
438    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
439        let mut result: usize = 0;
440        let layout = layout(&self.data_type);
441
442        for spec in layout.buffers.iter() {
443            match spec {
444                BufferSpec::FixedWidth { byte_width, .. } => {
445                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
446                        ArrowError::ComputeError(
447                            "Integer overflow computing buffer size".to_string(),
448                        )
449                    })?;
450                    result += buffer_size;
451                }
452                BufferSpec::VariableWidth => {
453                    let buffer_len: usize;
454                    match self.data_type {
455                        DataType::Utf8 | DataType::Binary => {
456                            let offsets = self.typed_offsets::<i32>()?;
457                            buffer_len = (offsets[self.len] - offsets[0] ) as usize;
458                        }
459                        DataType::LargeUtf8 | DataType::LargeBinary => {
460                            let offsets = self.typed_offsets::<i64>()?;
461                            buffer_len = (offsets[self.len] - offsets[0]) as usize;
462                        }
463                        _ => {
464                            return Err(ArrowError::NotYetImplemented(format!(
465                            "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
466                            self.data_type
467                            )))
468                        }
469                    };
470                    result += buffer_len;
471                }
472                BufferSpec::BitMap => {
473                    let buffer_size = bit_util::ceil(self.len, 8);
474                    result += buffer_size;
475                }
476                BufferSpec::AlwaysNull => {
477                    // Nothing to do
478                }
479            }
480        }
481
482        if self.nulls().is_some() {
483            result += bit_util::ceil(self.len, 8);
484        }
485
486        for child in &self.child_data {
487            result += child.get_slice_memory_size()?;
488        }
489        Ok(result)
490    }
491
492    /// Returns the total number of bytes of memory occupied
493    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
494    /// children. (See also diagram on [`ArrayData`]).
495    ///
496    /// Equivalent to:
497    ///  `size_of_val(self)` +
498    ///  [`Self::get_buffer_memory_size`] +
499    ///  `size_of_val(child)` for all children
500    pub fn get_array_memory_size(&self) -> usize {
501        let mut size = mem::size_of_val(self);
502
503        // Calculate rest of the fields top down which contain actual data
504        for buffer in &self.buffers {
505            size += mem::size_of::<Buffer>();
506            size += buffer.capacity();
507        }
508        if let Some(nulls) = &self.nulls {
509            size += nulls.buffer().capacity();
510        }
511        for child in &self.child_data {
512            size += child.get_array_memory_size();
513        }
514
515        size
516    }
517
518    /// Creates a zero-copy slice of itself. This creates a new
519    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
520    /// different offset and len
521    ///
522    /// # Panics
523    ///
524    /// Panics if `offset + length > self.len()`.
525    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
526        assert!((offset + length) <= self.len());
527
528        if let DataType::Struct(_) = self.data_type() {
529            // Slice into children
530            let new_offset = self.offset + offset;
531            let new_data = ArrayData {
532                data_type: self.data_type().clone(),
533                len: length,
534                offset: new_offset,
535                buffers: self.buffers.clone(),
536                // Slice child data, to propagate offsets down to them
537                child_data: self
538                    .child_data()
539                    .iter()
540                    .map(|data| data.slice(offset, length))
541                    .collect(),
542                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
543            };
544
545            new_data
546        } else {
547            let mut new_data = self.clone();
548
549            new_data.len = length;
550            new_data.offset = offset + self.offset;
551            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
552
553            new_data
554        }
555    }
556
557    /// Returns the `buffer` as a slice of type `T` starting at self.offset
558    /// # Panics
559    /// This function panics if:
560    /// * the buffer is not byte-aligned with type T, or
561    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
562    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
563        &self.buffers()[buffer].typed_data()[self.offset..]
564    }
565
566    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
567    pub fn new_null(data_type: &DataType, len: usize) -> Self {
568        let bit_len = bit_util::ceil(len, 8);
569        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
570
571        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
572            Some(width) => (vec![zeroed(width * len)], vec![], true),
573            None => match data_type {
574                DataType::Null => (vec![], vec![], false),
575                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
576                DataType::Binary | DataType::Utf8 => {
577                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
578                }
579                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
580                DataType::LargeBinary | DataType::LargeUtf8 => {
581                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
582                }
583                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
584                DataType::List(f) | DataType::Map(f, _) => (
585                    vec![zeroed((len + 1) * 4)],
586                    vec![ArrayData::new_empty(f.data_type())],
587                    true,
588                ),
589                DataType::LargeList(f) => (
590                    vec![zeroed((len + 1) * 8)],
591                    vec![ArrayData::new_empty(f.data_type())],
592                    true,
593                ),
594                DataType::FixedSizeList(f, list_len) => (
595                    vec![],
596                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
597                    true,
598                ),
599                DataType::Struct(fields) => (
600                    vec![],
601                    fields
602                        .iter()
603                        .map(|f| Self::new_null(f.data_type(), len))
604                        .collect(),
605                    true,
606                ),
607                DataType::Dictionary(k, v) => (
608                    vec![zeroed(k.primitive_width().unwrap() * len)],
609                    vec![ArrayData::new_empty(v.as_ref())],
610                    true,
611                ),
612                DataType::Union(f, mode) => {
613                    let (id, _) = f.iter().next().unwrap();
614                    let ids = Buffer::from_iter(std::iter::repeat(id).take(len));
615                    let buffers = match mode {
616                        UnionMode::Sparse => vec![ids],
617                        UnionMode::Dense => {
618                            let end_offset = i32::from_usize(len).unwrap();
619                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
620                        }
621                    };
622
623                    let children = f
624                        .iter()
625                        .enumerate()
626                        .map(|(idx, (_, f))| {
627                            if idx == 0 || *mode == UnionMode::Sparse {
628                                Self::new_null(f.data_type(), len)
629                            } else {
630                                Self::new_empty(f.data_type())
631                            }
632                        })
633                        .collect();
634
635                    (buffers, children, false)
636                }
637                DataType::RunEndEncoded(r, v) => {
638                    let runs = match r.data_type() {
639                        DataType::Int16 => {
640                            let i = i16::from_usize(len).expect("run overflow");
641                            Buffer::from_slice_ref([i])
642                        }
643                        DataType::Int32 => {
644                            let i = i32::from_usize(len).expect("run overflow");
645                            Buffer::from_slice_ref([i])
646                        }
647                        DataType::Int64 => {
648                            let i = i64::from_usize(len).expect("run overflow");
649                            Buffer::from_slice_ref([i])
650                        }
651                        dt => unreachable!("Invalid run ends data type {dt}"),
652                    };
653
654                    let builder = ArrayData::builder(r.data_type().clone())
655                        .len(1)
656                        .buffers(vec![runs]);
657
658                    // SAFETY:
659                    // Valid by construction
660                    let runs = unsafe { builder.build_unchecked() };
661                    (
662                        vec![],
663                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
664                        false,
665                    )
666                }
667                d => unreachable!("{d}"),
668            },
669        };
670
671        let mut builder = ArrayDataBuilder::new(data_type.clone())
672            .len(len)
673            .buffers(buffers)
674            .child_data(child_data);
675
676        if has_nulls {
677            builder = builder.nulls(Some(NullBuffer::new_null(len)))
678        }
679
680        // SAFETY:
681        // Data valid by construction
682        unsafe { builder.build_unchecked() }
683    }
684
    /// Returns a new empty [`ArrayData`] valid for `data_type`.
    ///
    /// Equivalent to [`Self::new_null`] with a length of `0`.
    pub fn new_empty(data_type: &DataType) -> Self {
        Self::new_null(data_type, 0)
    }
689
690    /// Verifies that the buffers meet the minimum alignment requirements for the data type
691    ///
692    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
693    ///
694    /// This can be useful for when interacting with data sent over IPC or FFI, that may
695    /// not meet the minimum alignment requirements
696    ///
697    /// This also aligns buffers of children data
698    pub fn align_buffers(&mut self) {
699        let layout = layout(&self.data_type);
700        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
701            if let BufferSpec::FixedWidth { alignment, .. } = spec {
702                if buffer.as_ptr().align_offset(*alignment) != 0 {
703                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
704                }
705            }
706        }
707        // align children data recursively
708        for data in self.child_data.iter_mut() {
709            data.align_buffers()
710        }
711    }
712
713    /// "cheap" validation of an `ArrayData`. Ensures buffers are
714    /// sufficiently sized to store `len` + `offset` total elements of
715    /// `data_type` and performs other inexpensive consistency checks.
716    ///
717    /// This check is "cheap" in the sense that it does not validate the
718    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
719    /// are within the bounds of the values buffer).
720    ///
721    /// See [ArrayData::validate_data] to validate fully the offset content
722    /// and the validity of utf8 data
723    pub fn validate(&self) -> Result<(), ArrowError> {
        // Need at least this much space in each buffer
725        let len_plus_offset = self.len + self.offset;
726
727        // Check that the data layout conforms to the spec
728        let layout = layout(&self.data_type);
729
730        if !layout.can_contain_null_mask && self.nulls.is_some() {
731            return Err(ArrowError::InvalidArgumentError(format!(
732                "Arrays of type {:?} cannot contain a null bitmask",
733                self.data_type,
734            )));
735        }
736
737        // Check data buffers length for view types and other types
738        if self.buffers.len() < layout.buffers.len()
739            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
740        {
741            return Err(ArrowError::InvalidArgumentError(format!(
742                "Expected {} buffers in array of type {:?}, got {}",
743                layout.buffers.len(),
744                self.data_type,
745                self.buffers.len(),
746            )));
747        }
748
749        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
750            match spec {
751                BufferSpec::FixedWidth {
752                    byte_width,
753                    alignment,
754                } => {
755                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
756
757                    if buffer.len() < min_buffer_size {
758                        return Err(ArrowError::InvalidArgumentError(format!(
759                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
760                            min_buffer_size, i, self.data_type, buffer.len()
761                        )));
762                    }
763
764                    let align_offset = buffer.as_ptr().align_offset(*alignment);
765                    if align_offset != 0 {
766                        return Err(ArrowError::InvalidArgumentError(format!(
767                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
768                            self.data_type, align_offset.min(alignment - align_offset)
769                        )));
770                    }
771                }
772                BufferSpec::VariableWidth => {
773                    // not cheap to validate (need to look at the
774                    // data). Partially checked in validate_offsets
775                    // called below. Can check with `validate_full`
776                }
777                BufferSpec::BitMap => {
778                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
779                    if buffer.len() < min_buffer_size {
780                        return Err(ArrowError::InvalidArgumentError(format!(
781                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
782                            min_buffer_size, i, self.data_type, buffer.len()
783                        )));
784                    }
785                }
786                BufferSpec::AlwaysNull => {
787                    // Nothing to validate
788                }
789            }
790        }
791
792        // check null bit buffer size
793        if let Some(nulls) = self.nulls() {
794            if nulls.null_count() > self.len {
795                return Err(ArrowError::InvalidArgumentError(format!(
796                    "null_count {} for an array exceeds length of {} elements",
797                    nulls.null_count(),
798                    self.len
799                )));
800            }
801
802            let actual_len = nulls.validity().len();
803            let needed_len = bit_util::ceil(len_plus_offset, 8);
804            if actual_len < needed_len {
805                return Err(ArrowError::InvalidArgumentError(format!(
806                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
807                )));
808            }
809
810            if nulls.len() != self.len {
811                return Err(ArrowError::InvalidArgumentError(format!(
812                    "null buffer incorrect size. got {} expected {}",
813                    nulls.len(),
814                    self.len
815                )));
816            }
817        }
818
819        self.validate_child_data()?;
820
821        // Additional Type specific checks
822        match &self.data_type {
823            DataType::Utf8 | DataType::Binary => {
824                self.validate_offsets::<i32>(self.buffers[1].len())?;
825            }
826            DataType::LargeUtf8 | DataType::LargeBinary => {
827                self.validate_offsets::<i64>(self.buffers[1].len())?;
828            }
829            DataType::Dictionary(key_type, _value_type) => {
830                // At the moment, constructing a DictionaryArray will also check this
831                if !DataType::is_dictionary_key_type(key_type) {
832                    return Err(ArrowError::InvalidArgumentError(format!(
833                        "Dictionary key type must be integer, but was {key_type}"
834                    )));
835                }
836            }
837            DataType::RunEndEncoded(run_ends_type, _) => {
838                if run_ends_type.is_nullable() {
839                    return Err(ArrowError::InvalidArgumentError(
840                        "The nullable should be set to false for the field defining run_ends array.".to_string()
841                    ));
842                }
843                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
844                    return Err(ArrowError::InvalidArgumentError(format!(
845                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
846                        run_ends_type.data_type()
847                    )));
848                }
849            }
850            _ => {}
851        };
852
853        Ok(())
854    }
855
856    /// Returns a reference to the data in `buffer` as a typed slice
857    /// (typically `&[i32]` or `&[i64]`) after validating. The
858    /// returned slice is guaranteed to have at least `self.len + 1`
859    /// entries.
860    ///
861    /// For an empty array, the `buffer` can also be empty.
862    fn typed_offsets<T: ArrowNativeType + num::Num>(&self) -> Result<&[T], ArrowError> {
863        // An empty list-like array can have 0 offsets
864        if self.len == 0 && self.buffers[0].is_empty() {
865            return Ok(&[]);
866        }
867
868        self.typed_buffer(0, self.len + 1)
869    }
870
871    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
872    fn typed_buffer<T: ArrowNativeType + num::Num>(
873        &self,
874        idx: usize,
875        len: usize,
876    ) -> Result<&[T], ArrowError> {
877        let buffer = &self.buffers[idx];
878
879        let required_len = (len + self.offset) * mem::size_of::<T>();
880
881        if buffer.len() < required_len {
882            return Err(ArrowError::InvalidArgumentError(format!(
883                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
884                idx,
885                self.data_type,
886                required_len,
887                buffer.len()
888            )));
889        }
890
891        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
892    }
893
894    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
895    /// offsets (of type T) into some other buffer of `values_length` bytes long
896    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
897        &self,
898        values_length: usize,
899    ) -> Result<(), ArrowError> {
900        // Justification: buffer size was validated above
901        let offsets = self.typed_offsets::<T>()?;
902        if offsets.is_empty() {
903            return Ok(());
904        }
905
906        let first_offset = offsets[0].to_usize().ok_or_else(|| {
907            ArrowError::InvalidArgumentError(format!(
908                "Error converting offset[0] ({}) to usize for {}",
909                offsets[0], self.data_type
910            ))
911        })?;
912
913        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
914            ArrowError::InvalidArgumentError(format!(
915                "Error converting offset[{}] ({}) to usize for {}",
916                self.len, offsets[self.len], self.data_type
917            ))
918        })?;
919
920        if first_offset > values_length {
921            return Err(ArrowError::InvalidArgumentError(format!(
922                "First offset {} of {} is larger than values length {}",
923                first_offset, self.data_type, values_length,
924            )));
925        }
926
927        if last_offset > values_length {
928            return Err(ArrowError::InvalidArgumentError(format!(
929                "Last offset {} of {} is larger than values length {}",
930                last_offset, self.data_type, values_length,
931            )));
932        }
933
934        if first_offset > last_offset {
935            return Err(ArrowError::InvalidArgumentError(format!(
936                "First offset {} in {} is smaller than last offset {}",
937                first_offset, self.data_type, last_offset,
938            )));
939        }
940
941        Ok(())
942    }
943
944    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
945    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
946    fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
947        &self,
948        values_length: usize,
949    ) -> Result<(), ArrowError> {
950        let offsets: &[T] = self.typed_buffer(0, self.len)?;
951        let sizes: &[T] = self.typed_buffer(1, self.len)?;
952        for i in 0..values_length {
953            let size = sizes[i].to_usize().ok_or_else(|| {
954                ArrowError::InvalidArgumentError(format!(
955                    "Error converting size[{}] ({}) to usize for {}",
956                    i, sizes[i], self.data_type
957                ))
958            })?;
959            let offset = offsets[i].to_usize().ok_or_else(|| {
960                ArrowError::InvalidArgumentError(format!(
961                    "Error converting offset[{}] ({}) to usize for {}",
962                    i, offsets[i], self.data_type
963                ))
964            })?;
965            if size
966                .checked_add(offset)
967                .expect("Offset and size have exceeded the usize boundary")
968                > values_length
969            {
970                return Err(ArrowError::InvalidArgumentError(format!(
971                    "Size {} at index {} is larger than the remaining values for {}",
972                    size, i, self.data_type
973                )));
974            }
975        }
976        Ok(())
977    }
978
979    /// Validates the layout of `child_data` ArrayData structures
980    fn validate_child_data(&self) -> Result<(), ArrowError> {
981        match &self.data_type {
982            DataType::List(field) | DataType::Map(field, _) => {
983                let values_data = self.get_single_valid_child_data(field.data_type())?;
984                self.validate_offsets::<i32>(values_data.len)?;
985                Ok(())
986            }
987            DataType::LargeList(field) => {
988                let values_data = self.get_single_valid_child_data(field.data_type())?;
989                self.validate_offsets::<i64>(values_data.len)?;
990                Ok(())
991            }
992            DataType::ListView(field) => {
993                let values_data = self.get_single_valid_child_data(field.data_type())?;
994                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
995                Ok(())
996            }
997            DataType::LargeListView(field) => {
998                let values_data = self.get_single_valid_child_data(field.data_type())?;
999                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1000                Ok(())
1001            }
1002            DataType::FixedSizeList(field, list_size) => {
1003                let values_data = self.get_single_valid_child_data(field.data_type())?;
1004
1005                let list_size: usize = (*list_size).try_into().map_err(|_| {
1006                    ArrowError::InvalidArgumentError(format!(
1007                        "{} has a negative list_size {}",
1008                        self.data_type, list_size
1009                    ))
1010                })?;
1011
1012                let expected_values_len = self.len
1013                    .checked_mul(list_size)
1014                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1015
1016                if values_data.len < expected_values_len {
1017                    return Err(ArrowError::InvalidArgumentError(format!(
1018                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1019                        values_data.len, list_size, list_size, self.data_type
1020                    )));
1021                }
1022
1023                Ok(())
1024            }
1025            DataType::Struct(fields) => {
1026                self.validate_num_child_data(fields.len())?;
1027                for (i, field) in fields.iter().enumerate() {
1028                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1029
1030                    // Ensure child field has sufficient size
1031                    if field_data.len < self.len {
1032                        return Err(ArrowError::InvalidArgumentError(format!(
1033                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1034                            self.data_type, i, field.name(), field_data.len, self.len
1035                        )));
1036                    }
1037                }
1038                Ok(())
1039            }
1040            DataType::RunEndEncoded(run_ends_field, values_field) => {
1041                self.validate_num_child_data(2)?;
1042                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1043                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1044                if run_ends_data.len != values_data.len {
1045                    return Err(ArrowError::InvalidArgumentError(format!(
1046                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1047                        run_ends_data.len, values_data.len
1048                    )));
1049                }
1050                if run_ends_data.nulls.is_some() {
1051                    return Err(ArrowError::InvalidArgumentError(
1052                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1053                    ));
1054                }
1055                Ok(())
1056            }
1057            DataType::Union(fields, mode) => {
1058                self.validate_num_child_data(fields.len())?;
1059
1060                for (i, (_, field)) in fields.iter().enumerate() {
1061                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1062
1063                    if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1064                        return Err(ArrowError::InvalidArgumentError(format!(
1065                            "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1066                            i, field_data.len, self.len + self.offset
1067                        )));
1068                    }
1069                }
1070                Ok(())
1071            }
1072            DataType::Dictionary(_key_type, value_type) => {
1073                self.get_single_valid_child_data(value_type)?;
1074                Ok(())
1075            }
1076            _ => {
1077                // other types do not have child data
1078                if !self.child_data.is_empty() {
1079                    return Err(ArrowError::InvalidArgumentError(format!(
1080                        "Expected no child arrays for type {} but got {}",
1081                        self.data_type,
1082                        self.child_data.len()
1083                    )));
1084                }
1085                Ok(())
1086            }
1087        }
1088    }
1089
1090    /// Ensures that this array data has a single child_data with the
1091    /// expected type, and calls `validate()` on it. Returns a
1092    /// reference to that child_data
1093    fn get_single_valid_child_data(
1094        &self,
1095        expected_type: &DataType,
1096    ) -> Result<&ArrayData, ArrowError> {
1097        self.validate_num_child_data(1)?;
1098        self.get_valid_child_data(0, expected_type)
1099    }
1100
1101    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1102    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1103        if self.child_data.len() != expected_len {
1104            Err(ArrowError::InvalidArgumentError(format!(
1105                "Value data for {} should contain {} child data array(s), had {}",
1106                self.data_type,
1107                expected_len,
1108                self.child_data.len()
1109            )))
1110        } else {
1111            Ok(())
1112        }
1113    }
1114
1115    /// Ensures that `child_data[i]` has the expected type, calls
1116    /// `validate()` on it, and returns a reference to that child_data
1117    fn get_valid_child_data(
1118        &self,
1119        i: usize,
1120        expected_type: &DataType,
1121    ) -> Result<&ArrayData, ArrowError> {
1122        let values_data = self.child_data.get(i).ok_or_else(|| {
1123            ArrowError::InvalidArgumentError(format!(
1124                "{} did not have enough child arrays. Expected at least {} but had only {}",
1125                self.data_type,
1126                i + 1,
1127                self.child_data.len()
1128            ))
1129        })?;
1130
1131        if expected_type != &values_data.data_type {
1132            return Err(ArrowError::InvalidArgumentError(format!(
1133                "Child type mismatch for {}. Expected {} but child data had {}",
1134                self.data_type, expected_type, values_data.data_type
1135            )));
1136        }
1137
1138        values_data.validate()?;
1139        Ok(values_data)
1140    }
1141
1142    /// Validate that the data contained within this [`ArrayData`] is valid
1143    ///
1144    /// 1. Null count is correct
1145    /// 2. All offsets are valid
1146    /// 3. All String data is valid UTF-8
1147    /// 4. All dictionary offsets are valid
1148    ///
1149    /// Internally this calls:
1150    ///
1151    /// * [`Self::validate`]
1152    /// * [`Self::validate_nulls`]
1153    /// * [`Self::validate_values`]
1154    ///
1155    /// Note: this does not recurse into children, for a recursive variant
1156    /// see [`Self::validate_full`]
1157    pub fn validate_data(&self) -> Result<(), ArrowError> {
1158        self.validate()?;
1159
1160        self.validate_nulls()?;
1161        self.validate_values()?;
1162        Ok(())
1163    }
1164
1165    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1166    ///
1167    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1168    /// and all its children recursively
1169    pub fn validate_full(&self) -> Result<(), ArrowError> {
1170        self.validate_data()?;
1171        // validate all children recursively
1172        self.child_data
1173            .iter()
1174            .enumerate()
1175            .try_for_each(|(i, child_data)| {
1176                child_data.validate_full().map_err(|e| {
1177                    ArrowError::InvalidArgumentError(format!(
1178                        "{} child #{} invalid: {}",
1179                        self.data_type, i, e
1180                    ))
1181                })
1182            })?;
1183        Ok(())
1184    }
1185
1186    /// Validates the values stored within this [`ArrayData`] are valid
1187    /// without recursing into child [`ArrayData`]
1188    ///
1189    /// Does not (yet) check
1190    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1191    /// 2. the the null count is correct and that any
1192    /// 3. nullability requirements of its children are correct
1193    ///
1194    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1195    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1196        if let Some(nulls) = &self.nulls {
1197            let actual = nulls.len() - nulls.inner().count_set_bits();
1198            if actual != nulls.null_count() {
1199                return Err(ArrowError::InvalidArgumentError(format!(
1200                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1201                    nulls.null_count(),
1202                    actual
1203                )));
1204            }
1205        }
1206
1207        // In general non-nullable children should not contain nulls, however, for certain
1208        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1209        // space in the child. As such we permit nulls in the children in the corresponding
1210        // positions for such types
1211        match &self.data_type {
1212            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1213                if !f.is_nullable() {
1214                    self.validate_non_nullable(None, &self.child_data[0])?
1215                }
1216            }
1217            DataType::FixedSizeList(field, len) => {
1218                let child = &self.child_data[0];
1219                if !field.is_nullable() {
1220                    match &self.nulls {
1221                        Some(nulls) => {
1222                            let element_len = *len as usize;
1223                            let expanded = nulls.expand(element_len);
1224                            self.validate_non_nullable(Some(&expanded), child)?;
1225                        }
1226                        None => self.validate_non_nullable(None, child)?,
1227                    }
1228                }
1229            }
1230            DataType::Struct(fields) => {
1231                for (field, child) in fields.iter().zip(&self.child_data) {
1232                    if !field.is_nullable() {
1233                        self.validate_non_nullable(self.nulls(), child)?
1234                    }
1235                }
1236            }
1237            _ => {}
1238        }
1239
1240        Ok(())
1241    }
1242
1243    /// Verifies that `child` contains no nulls not present in `mask`
1244    fn validate_non_nullable(
1245        &self,
1246        mask: Option<&NullBuffer>,
1247        child: &ArrayData,
1248    ) -> Result<(), ArrowError> {
1249        let mask = match mask {
1250            Some(mask) => mask,
1251            None => {
1252                return match child.null_count() {
1253                    0 => Ok(()),
1254                    _ => Err(ArrowError::InvalidArgumentError(format!(
1255                        "non-nullable child of type {} contains nulls not present in parent {}",
1256                        child.data_type, self.data_type
1257                    ))),
1258                }
1259            }
1260        };
1261
1262        match child.nulls() {
1263            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1264                "non-nullable child of type {} contains nulls not present in parent",
1265                child.data_type
1266            ))),
1267            _ => Ok(()),
1268        }
1269    }
1270
1271    /// Validates the values stored within this [`ArrayData`] are valid
1272    /// without recursing into child [`ArrayData`]
1273    ///
1274    /// Does not (yet) check
1275    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1276    pub fn validate_values(&self) -> Result<(), ArrowError> {
1277        match &self.data_type {
1278            DataType::Utf8 => self.validate_utf8::<i32>(),
1279            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1280            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1281            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1282            DataType::BinaryView => {
1283                let views = self.typed_buffer::<u128>(0, self.len)?;
1284                validate_binary_view(views, &self.buffers[1..])
1285            }
1286            DataType::Utf8View => {
1287                let views = self.typed_buffer::<u128>(0, self.len)?;
1288                validate_string_view(views, &self.buffers[1..])
1289            }
1290            DataType::List(_) | DataType::Map(_, _) => {
1291                let child = &self.child_data[0];
1292                self.validate_offsets_full::<i32>(child.len)
1293            }
1294            DataType::LargeList(_) => {
1295                let child = &self.child_data[0];
1296                self.validate_offsets_full::<i64>(child.len)
1297            }
1298            DataType::Union(_, _) => {
1299                // Validate Union Array as part of implementing new Union semantics
1300                // See comments in `ArrayData::validate()`
1301                // https://github.com/apache/arrow-rs/issues/85
1302                //
1303                // TODO file follow on ticket for full union validation
1304                Ok(())
1305            }
1306            DataType::Dictionary(key_type, _value_type) => {
1307                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1308                let max_value = dictionary_length - 1;
1309                match key_type.as_ref() {
1310                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1311                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1312                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1313                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1314                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1315                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1316                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1317                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1318                    _ => unreachable!(),
1319                }
1320            }
1321            DataType::RunEndEncoded(run_ends, _values) => {
1322                let run_ends_data = self.child_data()[0].clone();
1323                match run_ends.data_type() {
1324                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1325                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1326                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1327                    _ => unreachable!(),
1328                }
1329            }
1330            _ => {
1331                // No extra validation check required for other types
1332                Ok(())
1333            }
1334        }
1335    }
1336
1337    /// Calls the `validate(item_index, range)` function for each of
1338    /// the ranges specified in the arrow offsets buffer of type
1339    /// `T`. Also validates that each offset is smaller than
1340    /// `offset_limit`
1341    ///
1342    /// For an empty array, the offsets buffer can either be empty
1343    /// or contain a single `0`.
1344    ///
1345    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1346    /// function would call `validate([1,2])`, and `validate([2,4])`
1347    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1348    where
1349        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1350        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1351    {
1352        self.typed_offsets::<T>()?
1353            .iter()
1354            .enumerate()
1355            .map(|(i, x)| {
1356                // check if the offset can be converted to usize
1357                let r = x.to_usize().ok_or_else(|| {
1358                    ArrowError::InvalidArgumentError(format!(
1359                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1360                    );
1361                // check if the offset exceeds the limit
1362                match r {
1363                    Ok(n) if n <= offset_limit => Ok((i, n)),
1364                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1365                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1366                    ),
1367                    Err(e) => Err(e),
1368                }
1369            })
1370            .scan(0_usize, |start, end| {
1371                // check offsets are monotonically increasing
1372                match end {
1373                    Ok((i, end)) if *start <= end => {
1374                        let range = Some(Ok((i, *start..end)));
1375                        *start = end;
1376                        range
1377                    }
1378                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1379                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1380                        i - 1, start, end))
1381                    )),
1382                    Err(err) => Some(Err(err)),
1383                }
1384            })
1385            .skip(1) // the first element is meaningless
1386            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1387                let (item_index, range) = res?;
1388                validate(item_index-1, range)
1389            })
1390    }
1391
1392    /// Ensures that all strings formed by the offsets in `buffers[0]`
1393    /// into `buffers[1]` are valid utf8 sequences
1394    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1395    where
1396        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1397    {
1398        let values_buffer = &self.buffers[1].as_slice();
1399        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1400            // Validate Offsets are correct
1401            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1402                if !values_str.is_char_boundary(range.start)
1403                    || !values_str.is_char_boundary(range.end)
1404                {
1405                    return Err(ArrowError::InvalidArgumentError(format!(
1406                        "incomplete utf-8 byte sequence from index {string_index}"
1407                    )));
1408                }
1409                Ok(())
1410            })
1411        } else {
1412            // find specific offset that failed utf8 validation
1413            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1414                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1415                    ArrowError::InvalidArgumentError(format!(
1416                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1417                    ))
1418                })?;
1419                Ok(())
1420            })
1421        }
1422    }
1423
1424    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1425    /// between `0` and `offset_limit`
1426    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1427    where
1428        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1429    {
1430        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1431            // No validation applied to each value, but the iteration
1432            // itself applies bounds checking to each range
1433            Ok(())
1434        })
1435    }
1436
1437    /// Validates that each value in self.buffers (typed as T)
1438    /// is within the range [0, max_value], inclusive
1439    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1440    where
1441        T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1442    {
1443        let required_len = self.len + self.offset;
1444        let buffer = &self.buffers[0];
1445
1446        // This should have been checked as part of `validate()` prior
1447        // to calling `validate_full()` but double check to be sure
1448        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1449
1450        // Justification: buffer size was validated above
1451        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1452
1453        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1454            // Do not check the value is null (value can be arbitrary)
1455            if self.is_null(i) {
1456                return Ok(());
1457            }
1458            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1459                ArrowError::InvalidArgumentError(format!(
1460                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1461                ))
1462            })?;
1463
1464            if dict_index < 0 || dict_index > max_value {
1465                return Err(ArrowError::InvalidArgumentError(format!(
1466                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1467                )));
1468            }
1469            Ok(())
1470        })
1471    }
1472
1473    /// Validates that each value in run_ends array is positive and strictly increasing.
1474    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1475    where
1476        T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1477    {
1478        let values = self.typed_buffer::<T>(0, self.len)?;
1479        let mut prev_value: i64 = 0_i64;
1480        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1481            let value: i64 = inp_value.try_into().map_err(|_| {
1482                ArrowError::InvalidArgumentError(format!(
1483                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1484                ))
1485            })?;
1486            if value <= 0_i64 {
1487                return Err(ArrowError::InvalidArgumentError(format!(
1488                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1489                )));
1490            }
1491            if ix > 0 && value <= prev_value {
1492                return Err(ArrowError::InvalidArgumentError(format!(
1493                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1494                )));
1495            }
1496
1497            prev_value = value;
1498            Ok(())
1499        })?;
1500
1501        if prev_value.as_usize() < (self.offset + self.len) {
1502            return Err(ArrowError::InvalidArgumentError(format!(
1503                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1504                self.offset + self.len
1505            )));
1506        }
1507        Ok(())
1508    }
1509
1510    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1511    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1512    /// return false when the arrays are logically equal
1513    pub fn ptr_eq(&self, other: &Self) -> bool {
1514        if self.offset != other.offset
1515            || self.len != other.len
1516            || self.data_type != other.data_type
1517            || self.buffers.len() != other.buffers.len()
1518            || self.child_data.len() != other.child_data.len()
1519        {
1520            return false;
1521        }
1522
1523        match (&self.nulls, &other.nulls) {
1524            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1525            (Some(_), None) | (None, Some(_)) => return false,
1526            _ => {}
1527        };
1528
1529        if !self
1530            .buffers
1531            .iter()
1532            .zip(other.buffers.iter())
1533            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1534        {
1535            return false;
1536        }
1537
1538        self.child_data
1539            .iter()
1540            .zip(other.child_data.iter())
1541            .all(|(a, b)| a.ptr_eq(b))
1542    }
1543
1544    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1545    pub fn into_builder(self) -> ArrayDataBuilder {
1546        self.into()
1547    }
1548}
1549
1550/// Return the expected [`DataTypeLayout`] Arrays of this data
1551/// type are expected to have
1552pub fn layout(data_type: &DataType) -> DataTypeLayout {
1553    // based on C/C++ implementation in
1554    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1555    use arrow_schema::IntervalUnit::*;
1556
1557    match data_type {
1558        DataType::Null => DataTypeLayout {
1559            buffers: vec![],
1560            can_contain_null_mask: false,
1561            variadic: false,
1562        },
1563        DataType::Boolean => DataTypeLayout {
1564            buffers: vec![BufferSpec::BitMap],
1565            can_contain_null_mask: true,
1566            variadic: false,
1567        },
1568        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1569        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1570        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1571        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1572        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1573        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1574        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1575        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1576        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1577        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1578        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1579        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1580        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1581        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1582        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1583        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1584        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1585        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1586        DataType::Interval(MonthDayNano) => {
1587            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1588        }
1589        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1590        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1591        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1592        DataType::FixedSizeBinary(size) => {
1593            let spec = BufferSpec::FixedWidth {
1594                byte_width: (*size).try_into().unwrap(),
1595                alignment: mem::align_of::<u8>(),
1596            };
1597            DataTypeLayout {
1598                buffers: vec![spec],
1599                can_contain_null_mask: true,
1600                variadic: false,
1601            }
1602        }
1603        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1604        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1605        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1606        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1607        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1608        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1609        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1610        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1611        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1612        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1613        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1614        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1615        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1616        DataType::Union(_, mode) => {
1617            let type_ids = BufferSpec::FixedWidth {
1618                byte_width: mem::size_of::<i8>(),
1619                alignment: mem::align_of::<i8>(),
1620            };
1621
1622            DataTypeLayout {
1623                buffers: match mode {
1624                    UnionMode::Sparse => {
1625                        vec![type_ids]
1626                    }
1627                    UnionMode::Dense => {
1628                        vec![
1629                            type_ids,
1630                            BufferSpec::FixedWidth {
1631                                byte_width: mem::size_of::<i32>(),
1632                                alignment: mem::align_of::<i32>(),
1633                            },
1634                        ]
1635                    }
1636                },
1637                can_contain_null_mask: false,
1638                variadic: false,
1639            }
1640        }
1641        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1642    }
1643}
1644
/// Layout specification for a data type
#[derive(Debug, PartialEq, Eq)]
// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
pub struct DataTypeLayout {
    /// A vector of buffer layout specifications, one for each expected buffer
    pub buffers: Vec<BufferSpec>,

    /// Can contain a null bitmask
    pub can_contain_null_mask: bool,

    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
    /// buffers.len(). Buffers that exceed the lower bound are legal.
    pub variadic: bool,
}

impl DataTypeLayout {
    /// Builds the [`BufferSpec::FixedWidth`] for a buffer whose elements have
    /// the size and alignment of `T`
    fn fixed_width_spec<T>() -> BufferSpec {
        BufferSpec::FixedWidth {
            byte_width: mem::size_of::<T>(),
            alignment: mem::align_of::<T>(),
        }
    }

    /// Describes a basic numeric array where each element has type `T`
    pub fn new_fixed_width<T>() -> Self {
        Self {
            buffers: vec![Self::fixed_width_spec::<T>()],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// but may still have a Null Bitmap (e.g. FixedSizeList)
    pub fn new_nullable_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// (e.g. RunEndEncoded).
    pub fn new_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: false,
            variadic: false,
        }
    }

    /// Describes a variable width array: a fixed width offset buffer with
    /// elements of type `T`, followed by a variable width data buffer
    pub fn new_binary<T>() -> Self {
        let offsets = Self::fixed_width_spec::<T>();
        let values = BufferSpec::VariableWidth;
        Self {
            buffers: vec![offsets, values],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes a view type: one fixed width buffer of `u128` sized elements,
    /// plus any number of additional buffers (`variadic`)
    pub fn new_view() -> Self {
        Self {
            buffers: vec![Self::fixed_width_spec::<u128>()],
            can_contain_null_mask: true,
            variadic: true,
        }
    }

    /// Describes a list view type: two fixed width buffers with elements of type `T`
    pub fn new_list_view<T>() -> Self {
        // presumably the offsets buffer followed by the sizes buffer -- TODO confirm
        let first = Self::fixed_width_spec::<T>();
        let second = Self::fixed_width_spec::<T>();
        Self {
            buffers: vec![first, second],
            can_contain_null_mask: true,
            // NOTE(review): the `variadic` field is documented as applying only to
            // BinaryView/Utf8View; confirm that list views are meant to accept
            // extra buffers before relying on this.
            variadic: true,
        }
    }
}

/// Layout specification for a single data type buffer
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
    ///
    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
    ///
    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
    ///
    /// Note that these alignment requirements will vary between architectures
    FixedWidth {
        /// The width of each element in bytes
        byte_width: usize,
        /// The alignment required by Rust for an array of the corresponding primitive
        alignment: usize,
    },
    /// Variable width, such as string data for utf8 data
    VariableWidth,
    /// Buffer holds a bitmap.
    ///
    /// Note: Unlike the C++ implementation, the null/validity buffer
    /// is handled specially rather than as another of the buffers in
    /// the spec, so this variant is only used for the Boolean type.
    BitMap,
    /// Buffer is always null. Unused currently in Rust implementation,
    /// (used in C++ for Union type)
    #[allow(dead_code)]
    AlwaysNull,
}
1776
1777impl PartialEq for ArrayData {
1778    fn eq(&self, other: &Self) -> bool {
1779        equal::equal(self, other)
1780    }
1781}
1782
/// A boolean flag that cannot be mutated outside of unsafe code.
///
/// Defaults to a value of false.
///
/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
///
/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
///
/// # Example
/// ```rust
/// use arrow_data::UnsafeFlag;
/// assert!(!UnsafeFlag::default().get()); // default is false
/// let mut flag = UnsafeFlag::new();
/// assert!(!flag.get()); // defaults to false
/// // can only set it to true in unsafe code
/// unsafe { flag.set(true) };
/// assert!(flag.get()); // now true
/// ```
#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a new `UnsafeFlag` with the value set to `false`.
    ///
    /// See the example on [`UnsafeFlag`]
    #[inline]
    pub const fn new() -> Self {
        UnsafeFlag(false)
    }

    /// Sets the value of the flag to the given value
    ///
    /// Note this can purposely only be done in `unsafe` code
    ///
    /// # Safety
    ///
    /// If set, the flag will be set to the given value. There is nothing
    /// immediately unsafe about doing so, however, the flag can be used to
    /// subsequently bypass safety checks in the [`ArrayDataBuilder`].
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        let Self(inner) = self;
        *inner = val;
    }

    /// Returns the value of the flag
    #[inline]
    pub fn get(&self) -> bool {
        let Self(inner) = self;
        *inner
    }
}

// Manual impl to make it clear you can not construct unsafe with true
impl Default for UnsafeFlag {
    /// Returns a flag holding `false`, the same as [`UnsafeFlag::new`]
    fn default() -> Self {
        UnsafeFlag::new()
    }
}
1841
/// Builder for [`ArrayData`] type
///
/// Fields are populated through the chained setter methods and consumed by
/// [`ArrayDataBuilder::build`].
#[derive(Debug)]
pub struct ArrayDataBuilder {
    /// Data type of the array being built
    data_type: DataType,
    /// Length of the array; [`ArrayDataBuilder::new`] starts at 0
    len: usize,
    /// Explicit null count, if provided.
    ///
    /// `build` only consults this when the null mask is created from
    /// `null_bit_buffer`, in which case it is used without recounting.
    null_count: Option<usize>,
    /// Raw validity bitmap; `build` wraps it using `offset` and `len`, and
    /// ignores it when `nulls` is set
    null_bit_buffer: Option<Buffer>,
    /// Null buffer; takes precedence over `null_bit_buffer` in `build`
    nulls: Option<NullBuffer>,
    /// Offset of the array; [`ArrayDataBuilder::new`] starts at 0
    offset: usize,
    /// Buffers of the array
    buffers: Vec<Buffer>,
    /// Child data of the array
    child_data: Vec<ArrayData>,
    /// Should buffers be realigned (copying if necessary)?
    ///
    /// Defaults to false.
    align_buffers: bool,
    /// Should data validation be skipped for this [`ArrayData`]?
    ///
    /// Defaults to false.
    ///
    /// # Safety
    ///
    /// This flag can only be set to true using `unsafe` APIs. However, once true
    /// subsequent calls to `build()` may result in undefined behavior if the data
    /// is not valid.
    skip_validation: UnsafeFlag,
}
1868
1869impl ArrayDataBuilder {
1870    #[inline]
1871    /// Creates a new array data builder
1872    pub const fn new(data_type: DataType) -> Self {
1873        Self {
1874            data_type,
1875            len: 0,
1876            null_count: None,
1877            null_bit_buffer: None,
1878            nulls: None,
1879            offset: 0,
1880            buffers: vec![],
1881            child_data: vec![],
1882            align_buffers: false,
1883            skip_validation: UnsafeFlag::new(),
1884        }
1885    }
1886
1887    /// Creates a new array data builder from an existing one, changing the data type
1888    pub fn data_type(self, data_type: DataType) -> Self {
1889        Self { data_type, ..self }
1890    }
1891
1892    #[inline]
1893    #[allow(clippy::len_without_is_empty)]
1894    /// Sets the length of the [ArrayData]
1895    pub const fn len(mut self, n: usize) -> Self {
1896        self.len = n;
1897        self
1898    }
1899
1900    /// Sets the null buffer of the [ArrayData]
1901    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1902        self.nulls = nulls;
1903        self.null_count = None;
1904        self.null_bit_buffer = None;
1905        self
1906    }
1907
1908    /// Sets the null count of the [ArrayData]
1909    pub fn null_count(mut self, null_count: usize) -> Self {
1910        self.null_count = Some(null_count);
1911        self
1912    }
1913
1914    /// Sets the `null_bit_buffer` of the [ArrayData]
1915    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1916        self.nulls = None;
1917        self.null_bit_buffer = buf;
1918        self
1919    }
1920
1921    /// Sets the offset of the [ArrayData]
1922    #[inline]
1923    pub const fn offset(mut self, n: usize) -> Self {
1924        self.offset = n;
1925        self
1926    }
1927
1928    /// Sets the buffers of the [ArrayData]
1929    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1930        self.buffers = v;
1931        self
1932    }
1933
1934    /// Adds a single buffer to the [ArrayData]'s buffers
1935    pub fn add_buffer(mut self, b: Buffer) -> Self {
1936        self.buffers.push(b);
1937        self
1938    }
1939
1940    /// Adds multiple buffers to the [ArrayData]'s buffers
1941    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1942        self.buffers.extend(bs);
1943        self
1944    }
1945
1946    /// Sets the child data of the [ArrayData]
1947    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1948        self.child_data = v;
1949        self
1950    }
1951
1952    /// Adds a single child data to the [ArrayData]'s child data
1953    pub fn add_child_data(mut self, r: ArrayData) -> Self {
1954        self.child_data.push(r);
1955        self
1956    }
1957
1958    /// Creates an array data, without any validation
1959    ///
1960    /// Note: This is shorthand for
1961    /// ```rust
1962    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
1963    /// # let _ = unsafe {
1964    /// builder.skip_validation(true).build().unwrap()
1965    /// # };
1966    /// ```
1967    ///
1968    /// # Safety
1969    ///
1970    /// The same caveats as [`ArrayData::new_unchecked`]
1971    /// apply.
1972    pub unsafe fn build_unchecked(self) -> ArrayData {
1973        self.skip_validation(true).build().unwrap()
1974    }
1975
1976    /// Creates an `ArrayData`, consuming `self`
1977    ///
1978    /// # Safety
1979    ///
1980    /// By default the underlying buffers are checked to ensure they are valid
1981    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
1982    /// to true (by the `unsafe` API) this validation is skipped. If the data is
1983    /// not valid, undefined behavior will result.
1984    pub fn build(self) -> Result<ArrayData, ArrowError> {
1985        let Self {
1986            data_type,
1987            len,
1988            null_count,
1989            null_bit_buffer,
1990            nulls,
1991            offset,
1992            buffers,
1993            child_data,
1994            align_buffers,
1995            skip_validation,
1996        } = self;
1997
1998        let nulls = nulls
1999            .or_else(|| {
2000                let buffer = null_bit_buffer?;
2001                let buffer = BooleanBuffer::new(buffer, offset, len);
2002                Some(match null_count {
2003                    Some(n) => {
2004                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2005                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2006                    }
2007                    None => NullBuffer::new(buffer),
2008                })
2009            })
2010            .filter(|b| b.null_count() != 0);
2011
2012        let mut data = ArrayData {
2013            data_type,
2014            len,
2015            offset,
2016            buffers,
2017            child_data,
2018            nulls,
2019        };
2020
2021        if align_buffers {
2022            data.align_buffers();
2023        }
2024
2025        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2026        if !skip_validation.get() || cfg!(feature = "force_validate") {
2027            data.validate_data()?;
2028        }
2029        Ok(data)
2030    }
2031
2032    /// Creates an array data, validating all inputs, and aligning any buffers
2033    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2034    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2035        self.align_buffers(true).build()
2036    }
2037
2038    /// Ensure that all buffers are aligned, copying data if necessary
2039    ///
2040    /// Rust requires that arrays are aligned to their corresponding primitive,
2041    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2042    ///
2043    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2044    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2045    ///
2046    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2047    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2048    /// when necessary, making it useful when interacting with buffers produced by other systems,
2049    /// e.g. IPC or FFI.
2050    ///
2051    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2052    /// insufficiently aligned buffers.
2053    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2054        self.align_buffers = align_buffers;
2055        self
2056    }
2057
2058    /// Skips validation of the data.
2059    ///
2060    /// If this flag is enabled, `[Self::build`] will skip validation of the
2061    /// data
2062    ///
2063    /// If this flag is not enabled, `[Self::build`] will validate that all
2064    /// buffers are valid and will return an error if any data is invalid.
2065    /// Validation can be expensive.
2066    ///
2067    /// # Safety
2068    ///
2069    /// If validation is skipped, the buffers must form a valid Arrow array,
2070    /// otherwise undefined behavior will result
2071    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2072        self.skip_validation.set(skip_validation);
2073        self
2074    }
2075}
2076
2077impl From<ArrayData> for ArrayDataBuilder {
2078    fn from(d: ArrayData) -> Self {
2079        Self {
2080            data_type: d.data_type,
2081            len: d.len,
2082            offset: d.offset,
2083            buffers: d.buffers,
2084            child_data: d.child_data,
2085            nulls: d.nulls,
2086            null_bit_buffer: None,
2087            null_count: None,
2088            align_buffers: false,
2089            skip_validation: UnsafeFlag::new(),
2090        }
2091    }
2092}
2093
2094#[cfg(test)]
2095mod tests {
2096    use super::*;
2097    use arrow_schema::{Field, Fields};
2098
2099    // See arrow/tests/array_data_validation.rs for test of array validation
2100
2101    /// returns a buffer initialized with some constant value for tests
2102    fn make_i32_buffer(n: usize) -> Buffer {
2103        Buffer::from_slice_ref(vec![42i32; n])
2104    }
2105
2106    /// returns a buffer initialized with some constant value for tests
2107    fn make_f32_buffer(n: usize) -> Buffer {
2108        Buffer::from_slice_ref(vec![42f32; n])
2109    }
2110
2111    #[test]
2112    fn test_builder() {
2113        // Buffer needs to be at least 25 long
2114        let v = (0..25).collect::<Vec<i32>>();
2115        let b1 = Buffer::from_slice_ref(&v);
2116        let arr_data = ArrayData::builder(DataType::Int32)
2117            .len(20)
2118            .offset(5)
2119            .add_buffer(b1)
2120            .null_bit_buffer(Some(Buffer::from([
2121                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2122            ])))
2123            .build()
2124            .unwrap();
2125
2126        assert_eq!(20, arr_data.len());
2127        assert_eq!(10, arr_data.null_count());
2128        assert_eq!(5, arr_data.offset());
2129        assert_eq!(1, arr_data.buffers().len());
2130        assert_eq!(
2131            Buffer::from_slice_ref(&v).as_slice(),
2132            arr_data.buffers()[0].as_slice()
2133        );
2134    }
2135
2136    #[test]
2137    fn test_builder_with_child_data() {
2138        let child_arr_data = ArrayData::try_new(
2139            DataType::Int32,
2140            5,
2141            None,
2142            0,
2143            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2144            vec![],
2145        )
2146        .unwrap();
2147
2148        let field = Arc::new(Field::new("x", DataType::Int32, true));
2149        let data_type = DataType::Struct(vec![field].into());
2150
2151        let arr_data = ArrayData::builder(data_type)
2152            .len(5)
2153            .offset(0)
2154            .add_child_data(child_arr_data.clone())
2155            .build()
2156            .unwrap();
2157
2158        assert_eq!(5, arr_data.len());
2159        assert_eq!(1, arr_data.child_data().len());
2160        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2161    }
2162
2163    #[test]
2164    fn test_null_count() {
2165        let mut bit_v: [u8; 2] = [0; 2];
2166        bit_util::set_bit(&mut bit_v, 0);
2167        bit_util::set_bit(&mut bit_v, 3);
2168        bit_util::set_bit(&mut bit_v, 10);
2169        let arr_data = ArrayData::builder(DataType::Int32)
2170            .len(16)
2171            .add_buffer(make_i32_buffer(16))
2172            .null_bit_buffer(Some(Buffer::from(bit_v)))
2173            .build()
2174            .unwrap();
2175        assert_eq!(13, arr_data.null_count());
2176
2177        // Test with offset
2178        let mut bit_v: [u8; 2] = [0; 2];
2179        bit_util::set_bit(&mut bit_v, 0);
2180        bit_util::set_bit(&mut bit_v, 3);
2181        bit_util::set_bit(&mut bit_v, 10);
2182        let arr_data = ArrayData::builder(DataType::Int32)
2183            .len(12)
2184            .offset(2)
2185            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2186            .null_bit_buffer(Some(Buffer::from(bit_v)))
2187            .build()
2188            .unwrap();
2189        assert_eq!(10, arr_data.null_count());
2190    }
2191
2192    #[test]
2193    fn test_null_buffer_ref() {
2194        let mut bit_v: [u8; 2] = [0; 2];
2195        bit_util::set_bit(&mut bit_v, 0);
2196        bit_util::set_bit(&mut bit_v, 3);
2197        bit_util::set_bit(&mut bit_v, 10);
2198        let arr_data = ArrayData::builder(DataType::Int32)
2199            .len(16)
2200            .add_buffer(make_i32_buffer(16))
2201            .null_bit_buffer(Some(Buffer::from(bit_v)))
2202            .build()
2203            .unwrap();
2204        assert!(arr_data.nulls().is_some());
2205        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2206    }
2207
2208    #[test]
2209    fn test_slice() {
2210        let mut bit_v: [u8; 2] = [0; 2];
2211        bit_util::set_bit(&mut bit_v, 0);
2212        bit_util::set_bit(&mut bit_v, 3);
2213        bit_util::set_bit(&mut bit_v, 10);
2214        let data = ArrayData::builder(DataType::Int32)
2215            .len(16)
2216            .add_buffer(make_i32_buffer(16))
2217            .null_bit_buffer(Some(Buffer::from(bit_v)))
2218            .build()
2219            .unwrap();
2220        let new_data = data.slice(1, 15);
2221        assert_eq!(data.len() - 1, new_data.len());
2222        assert_eq!(1, new_data.offset());
2223        assert_eq!(data.null_count(), new_data.null_count());
2224
2225        // slice of a slice (removes one null)
2226        let new_data = new_data.slice(1, 14);
2227        assert_eq!(data.len() - 2, new_data.len());
2228        assert_eq!(2, new_data.offset());
2229        assert_eq!(data.null_count() - 1, new_data.null_count());
2230    }
2231
2232    #[test]
2233    fn test_equality() {
2234        let int_data = ArrayData::builder(DataType::Int32)
2235            .len(1)
2236            .add_buffer(make_i32_buffer(1))
2237            .build()
2238            .unwrap();
2239
2240        let float_data = ArrayData::builder(DataType::Float32)
2241            .len(1)
2242            .add_buffer(make_f32_buffer(1))
2243            .build()
2244            .unwrap();
2245        assert_ne!(int_data, float_data);
2246        assert!(!int_data.ptr_eq(&float_data));
2247        assert!(int_data.ptr_eq(&int_data));
2248
2249        #[allow(clippy::redundant_clone)]
2250        let int_data_clone = int_data.clone();
2251        assert_eq!(int_data, int_data_clone);
2252        assert!(int_data.ptr_eq(&int_data_clone));
2253        assert!(int_data_clone.ptr_eq(&int_data));
2254
2255        let int_data_slice = int_data_clone.slice(1, 0);
2256        assert!(int_data_slice.ptr_eq(&int_data_slice));
2257        assert!(!int_data.ptr_eq(&int_data_slice));
2258        assert!(!int_data_slice.ptr_eq(&int_data));
2259
2260        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2261        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2262        let string_data = ArrayData::try_new(
2263            DataType::Utf8,
2264            3,
2265            Some(Buffer::from_iter(vec![true, false, true])),
2266            0,
2267            vec![offsets_buffer, data_buffer],
2268            vec![],
2269        )
2270        .unwrap();
2271
2272        assert_ne!(float_data, string_data);
2273        assert!(!float_data.ptr_eq(&string_data));
2274
2275        assert!(string_data.ptr_eq(&string_data));
2276
2277        #[allow(clippy::redundant_clone)]
2278        let string_data_cloned = string_data.clone();
2279        assert!(string_data_cloned.ptr_eq(&string_data));
2280        assert!(string_data.ptr_eq(&string_data_cloned));
2281
2282        let string_data_slice = string_data.slice(1, 2);
2283        assert!(string_data_slice.ptr_eq(&string_data_slice));
2284        assert!(!string_data_slice.ptr_eq(&string_data))
2285    }
2286
2287    #[test]
2288    fn test_slice_memory_size() {
2289        let mut bit_v: [u8; 2] = [0; 2];
2290        bit_util::set_bit(&mut bit_v, 0);
2291        bit_util::set_bit(&mut bit_v, 3);
2292        bit_util::set_bit(&mut bit_v, 10);
2293        let data = ArrayData::builder(DataType::Int32)
2294            .len(16)
2295            .add_buffer(make_i32_buffer(16))
2296            .null_bit_buffer(Some(Buffer::from(bit_v)))
2297            .build()
2298            .unwrap();
2299        let new_data = data.slice(1, 14);
2300        assert_eq!(
2301            data.get_slice_memory_size().unwrap() - 8,
2302            new_data.get_slice_memory_size().unwrap()
2303        );
2304        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2305        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2306        let string_data = ArrayData::try_new(
2307            DataType::Utf8,
2308            3,
2309            Some(Buffer::from_iter(vec![true, false, true])),
2310            0,
2311            vec![offsets_buffer, data_buffer],
2312            vec![],
2313        )
2314        .unwrap();
2315        let string_data_slice = string_data.slice(1, 2);
2316        //4 bytes of offset and 2 bytes of data reduced by slicing.
2317        assert_eq!(
2318            string_data.get_slice_memory_size().unwrap() - 6,
2319            string_data_slice.get_slice_memory_size().unwrap()
2320        );
2321    }
2322
2323    #[test]
2324    fn test_count_nulls() {
2325        let buffer = Buffer::from([0b00010110, 0b10011111]);
2326        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2327        let count = count_nulls(Some(&buffer), 0, 16);
2328        assert_eq!(count, 7);
2329
2330        let count = count_nulls(Some(&buffer), 4, 8);
2331        assert_eq!(count, 3);
2332    }
2333
2334    #[test]
2335    fn test_contains_nulls() {
2336        let buffer: Buffer =
2337            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2338        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2339        assert!(contains_nulls(Some(&buffer), 0, 6));
2340        assert!(contains_nulls(Some(&buffer), 0, 3));
2341        assert!(!contains_nulls(Some(&buffer), 3, 2));
2342        assert!(!contains_nulls(Some(&buffer), 0, 0));
2343    }
2344
2345    #[test]
2346    fn test_alignment() {
2347        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2348        let sliced = buffer.slice(1);
2349
2350        let mut data = ArrayData {
2351            data_type: DataType::Int32,
2352            len: 0,
2353            offset: 0,
2354            buffers: vec![buffer],
2355            child_data: vec![],
2356            nulls: None,
2357        };
2358        data.validate_full().unwrap();
2359
2360        // break alignment in data
2361        data.buffers[0] = sliced;
2362        let err = data.validate().unwrap_err();
2363
2364        assert_eq!(
2365            err.to_string(),
2366            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2367        );
2368
2369        data.align_buffers();
2370        data.validate_full().unwrap();
2371    }
2372
2373    #[test]
2374    fn test_alignment_struct() {
2375        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2376        let sliced = buffer.slice(1);
2377
2378        let child_data = ArrayData {
2379            data_type: DataType::Int32,
2380            len: 0,
2381            offset: 0,
2382            buffers: vec![buffer],
2383            child_data: vec![],
2384            nulls: None,
2385        };
2386
2387        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2388        let mut data = ArrayData {
2389            data_type: schema,
2390            len: 0,
2391            offset: 0,
2392            buffers: vec![],
2393            child_data: vec![child_data],
2394            nulls: None,
2395        };
2396        data.validate_full().unwrap();
2397
2398        // break alignment in child data
2399        data.child_data[0].buffers[0] = sliced;
2400        let err = data.validate().unwrap_err();
2401
2402        assert_eq!(
2403            err.to_string(),
2404            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2405        );
2406
2407        data.align_buffers();
2408        data.validate_full().unwrap();
2409    }
2410
2411    #[test]
2412    fn test_null_view_types() {
2413        let array_len = 32;
2414        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2415        assert_eq!(array.len(), array_len);
2416        for i in 0..array.len() {
2417            assert!(array.is_null(i));
2418        }
2419
2420        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2421        assert_eq!(array.len(), array_len);
2422        for i in 0..array.len() {
2423            assert!(array.is_null(i));
2424        }
2425    }
2426}