Skip to main content

arrow_array/array/
struct_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::print_long_array;
19use crate::{Array, ArrayRef, RecordBatch, make_array, new_null_array};
20use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer};
21use arrow_data::{ArrayData, ArrayDataBuilder};
22use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields};
23use std::sync::Arc;
24use std::{any::Any, ops::Index};
25
26/// An array of [structs](https://arrow.apache.org/docs/format/Columnar.html#struct-layout)
27///
28/// Each child (called *field*) is represented by a separate array.
29///
30/// # Comparison with [RecordBatch]
31///
32/// Both [`RecordBatch`] and [`StructArray`] represent a collection of columns / arrays with the
33/// same length.
34///
35/// However, there are a couple of key differences:
36///
37/// * [`StructArray`] can be nested within other [`Array`], including itself
38/// * [`RecordBatch`] can contain top-level metadata on its associated [`Schema`][arrow_schema::Schema]
39/// * [`StructArray`] can contain top-level nulls, i.e. `null`
40/// * [`RecordBatch`] can only represent nulls in its child columns, i.e. `{"field": null}`
41///
42/// [`StructArray`] is therefore a more general data container than [`RecordBatch`], and as such
43/// code that needs to handle both will typically share an implementation in terms of
44/// [`StructArray`] and convert to/from [`RecordBatch`] as necessary.
45///
46/// [`From`] implementations are provided to facilitate this conversion, however, converting
47/// from a [`StructArray`] containing top-level nulls to a [`RecordBatch`] will panic, as there
48/// is no way to preserve them.
49///
50/// # Example: Create an array from a vector of fields
51///
52/// ```
53/// use std::sync::Arc;
54/// use arrow_array::{Array, ArrayRef, BooleanArray, Int32Array, StructArray};
55/// use arrow_schema::{DataType, Field};
56///
57/// let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true]));
58/// let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31]));
59///
60/// let struct_array = StructArray::from(vec![
61///     (
62///         Arc::new(Field::new("b", DataType::Boolean, false)),
63///         boolean.clone() as ArrayRef,
64///     ),
65///     (
66///         Arc::new(Field::new("c", DataType::Int32, false)),
67///         int.clone() as ArrayRef,
68///     ),
69/// ]);
70/// assert_eq!(struct_array.column(0).as_ref(), boolean.as_ref());
71/// assert_eq!(struct_array.column(1).as_ref(), int.as_ref());
72/// assert_eq!(4, struct_array.len());
73/// assert_eq!(0, struct_array.null_count());
74/// assert_eq!(0, struct_array.offset());
75/// ```
#[derive(Clone)]
pub struct StructArray {
    /// Number of struct elements (rows) in this array
    len: usize,
    /// Logical type of this array; accessors such as `fields` and `into_parts`
    /// treat any variant other than [`DataType::Struct`] as unreachable
    data_type: DataType,
    /// Optional top-level validity of the struct elements themselves
    nulls: Option<NullBuffer>,
    /// Child arrays, one per field, stored in field order
    fields: Vec<ArrayRef>,
}
83
84impl StructArray {
85    /// Create a new [`StructArray`] from the provided parts, panicking on failure
86    ///
87    /// # Panics
88    ///
89    /// Panics if [`Self::try_new`] returns an error
90    pub fn new(fields: Fields, arrays: Vec<ArrayRef>, nulls: Option<NullBuffer>) -> Self {
91        Self::try_new(fields, arrays, nulls).unwrap()
92    }
93
94    /// Create a new [`StructArray`] from the provided parts, returning an error on failure
95    ///
96    /// The length will be inferred from the length of the child arrays.  Returns an error if
97    /// there are no child arrays.  Consider using [`Self::try_new_with_length`] if the length
98    /// is known to avoid this.
99    ///
100    /// # Errors
101    ///
102    /// Errors if
103    ///
104    /// * `fields.len() == 0`
105    /// * Any reason that [`Self::try_new_with_length`] would error
106    pub fn try_new(
107        fields: Fields,
108        arrays: Vec<ArrayRef>,
109        nulls: Option<NullBuffer>,
110    ) -> Result<Self, ArrowError> {
111        let len = arrays.first().map(|x| x.len()).ok_or_else(||ArrowError::InvalidArgumentError("use StructArray::try_new_with_length or StructArray::new_empty_fields to create a struct array with no fields so that the length can be set correctly".to_string()))?;
112
113        Self::try_new_with_length(fields, arrays, nulls, len)
114    }
115
116    /// Create a new [`StructArray`] from the provided parts, returning an error on failure
117    ///
118    /// # Errors
119    ///
120    /// Errors if
121    ///
122    /// * `fields.len() != arrays.len()`
123    /// * `fields[i].data_type() != arrays[i].data_type()`
124    /// * `arrays[i].len() != arrays[j].len()`
125    /// * `arrays[i].len() != nulls.len()`
126    /// * `!fields[i].is_nullable() && !nulls.contains(arrays[i].nulls())`
127    pub fn try_new_with_length(
128        fields: Fields,
129        arrays: Vec<ArrayRef>,
130        nulls: Option<NullBuffer>,
131        len: usize,
132    ) -> Result<Self, ArrowError> {
133        if fields.len() != arrays.len() {
134            return Err(ArrowError::InvalidArgumentError(format!(
135                "Incorrect number of arrays for StructArray fields, expected {} got {}",
136                fields.len(),
137                arrays.len()
138            )));
139        }
140
141        if let Some(n) = nulls.as_ref() {
142            if n.len() != len {
143                return Err(ArrowError::InvalidArgumentError(format!(
144                    "Incorrect number of nulls for StructArray, expected {len} got {}",
145                    n.len(),
146                )));
147            }
148        }
149
150        for (f, a) in fields.iter().zip(&arrays) {
151            if f.data_type() != a.data_type() {
152                return Err(ArrowError::InvalidArgumentError(format!(
153                    "Incorrect datatype for StructArray field {:?}, expected {} got {}",
154                    f.name(),
155                    f.data_type(),
156                    a.data_type()
157                )));
158            }
159
160            if a.len() != len {
161                return Err(ArrowError::InvalidArgumentError(format!(
162                    "Incorrect array length for StructArray field {:?}, expected {} got {}",
163                    f.name(),
164                    len,
165                    a.len()
166                )));
167            }
168
169            if !f.is_nullable() {
170                if let Some(a) = a.logical_nulls() {
171                    if !nulls.as_ref().map(|n| n.contains(&a)).unwrap_or_default()
172                        && a.null_count() > 0
173                    {
174                        return Err(ArrowError::InvalidArgumentError(format!(
175                            "Found unmasked nulls for non-nullable StructArray field {:?}",
176                            f.name()
177                        )));
178                    }
179                }
180            }
181        }
182
183        Ok(Self {
184            len,
185            data_type: DataType::Struct(fields),
186            nulls: nulls.filter(|n| n.null_count() > 0),
187            fields: arrays,
188        })
189    }
190
191    /// Create a new [`StructArray`] of length `len` where all values are null
192    pub fn new_null(fields: Fields, len: usize) -> Self {
193        let arrays = fields
194            .iter()
195            .map(|f| new_null_array(f.data_type(), len))
196            .collect();
197
198        Self {
199            len,
200            data_type: DataType::Struct(fields),
201            nulls: Some(NullBuffer::new_null(len)),
202            fields: arrays,
203        }
204    }
205
206    /// Create a new [`StructArray`] from the provided parts without validation
207    ///
208    /// The length will be inferred from the length of the child arrays.  Panics if there are no
209    /// child arrays.  Consider using [`Self::new_unchecked_with_length`] if the length is known
210    /// to avoid this.
211    ///
212    /// # Safety
213    ///
214    /// Safe if [`Self::new`] would not panic with the given arguments
215    pub unsafe fn new_unchecked(
216        fields: Fields,
217        arrays: Vec<ArrayRef>,
218        nulls: Option<NullBuffer>,
219    ) -> Self {
220        if cfg!(feature = "force_validate") {
221            return Self::new(fields, arrays, nulls);
222        }
223
224        let len = arrays.first().map(|x| x.len()).expect(
225            "cannot use StructArray::new_unchecked if there are no fields, length is unknown",
226        );
227        Self {
228            len,
229            data_type: DataType::Struct(fields),
230            nulls,
231            fields: arrays,
232        }
233    }
234
235    /// Create a new [`StructArray`] from the provided parts without validation
236    ///
237    /// # Safety
238    ///
239    /// Safe if [`Self::new`] would not panic with the given arguments
240    pub unsafe fn new_unchecked_with_length(
241        fields: Fields,
242        arrays: Vec<ArrayRef>,
243        nulls: Option<NullBuffer>,
244        len: usize,
245    ) -> Self {
246        if cfg!(feature = "force_validate") {
247            return Self::try_new_with_length(fields, arrays, nulls, len).unwrap();
248        }
249
250        Self {
251            len,
252            data_type: DataType::Struct(fields),
253            nulls,
254            fields: arrays,
255        }
256    }
257
258    /// Create a new [`StructArray`] containing no fields
259    ///
260    /// # Panics
261    ///
262    /// If `len != nulls.len()`
263    pub fn new_empty_fields(len: usize, nulls: Option<NullBuffer>) -> Self {
264        if let Some(n) = &nulls {
265            assert_eq!(len, n.len())
266        }
267        Self {
268            len,
269            data_type: DataType::Struct(Fields::empty()),
270            fields: vec![],
271            nulls,
272        }
273    }
274
275    /// Deconstruct this array into its constituent parts
276    pub fn into_parts(self) -> (Fields, Vec<ArrayRef>, Option<NullBuffer>) {
277        let f = match self.data_type {
278            DataType::Struct(f) => f,
279            _ => unreachable!(),
280        };
281        (f, self.fields, self.nulls)
282    }
283
284    /// Returns the field at `pos`.
285    pub fn column(&self, pos: usize) -> &ArrayRef {
286        &self.fields[pos]
287    }
288
289    /// Return the number of fields in this struct array
290    pub fn num_columns(&self) -> usize {
291        self.fields.len()
292    }
293
294    /// Returns the fields of the struct array
295    pub fn columns(&self) -> &[ArrayRef] {
296        &self.fields
297    }
298
299    /// Return field names in this struct array
300    pub fn column_names(&self) -> Vec<&str> {
301        match self.data_type() {
302            DataType::Struct(fields) => fields
303                .iter()
304                .map(|f| f.name().as_str())
305                .collect::<Vec<&str>>(),
306            _ => unreachable!("Struct array's data type is not struct!"),
307        }
308    }
309
310    /// Returns the [`Fields`] of this [`StructArray`]
311    pub fn fields(&self) -> &Fields {
312        match self.data_type() {
313            DataType::Struct(f) => f,
314            _ => unreachable!(),
315        }
316    }
317
318    /// Return child array whose field name equals to column_name
319    ///
320    /// Note: A schema can currently have duplicate field names, in which case
321    /// the first field will always be selected.
322    /// This issue will be addressed in [#9205](https://github.com/apache/arrow-rs/issues/9205)
323    pub fn column_by_name(&self, column_name: &str) -> Option<&ArrayRef> {
324        self.fields()
325            .find(column_name)
326            .map(|(pos, _)| self.column(pos))
327    }
328
329    /// Returns the [`FieldRef`] at `pos`.
330    pub fn field(&self, pos: usize) -> &FieldRef {
331        &self.fields()[pos]
332    }
333
334    /// Return the [`FieldRef`] whose name equals to `field_name`
335    ///
336    /// Note: A schema can currently have duplicate field names, in which case
337    /// the first field will always be selected.
338    /// This issue will be addressed in [#9205](https://github.com/apache/arrow-rs/issues/9205)
339    pub fn field_by_name(&self, field_name: &str) -> Option<&FieldRef> {
340        self.fields().find(field_name).map(|(_, field)| field)
341    }
342
343    /// Returns a zero-copy slice of this array with the indicated offset and length.
344    pub fn slice(&self, offset: usize, len: usize) -> Self {
345        assert!(
346            offset.saturating_add(len) <= self.len,
347            "the length + offset of the sliced StructArray cannot exceed the existing length"
348        );
349
350        let fields = self.fields.iter().map(|a| a.slice(offset, len)).collect();
351
352        Self {
353            len,
354            data_type: self.data_type.clone(),
355            nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)),
356            fields,
357        }
358    }
359
360    /// Returns the children of this [`StructArray`] with the struct's validity
361    /// bitmap AND'd into each child's validity bitmap.
362    ///
363    /// This ensures that positions where the struct itself is null are also
364    /// null in each returned child array. Fields that were non-nullable are
365    /// marked nullable in the returned [`Fields`] when the struct has nulls.
366    ///
367    /// If the struct has no nulls, children and fields are returned as-is.
368    ///
369    /// This mirrors the semantics of C++ Arrow's `StructArray::Flatten`.
370    ///
371    /// # Example
372    ///
373    /// ```
374    /// # use std::sync::Arc;
375    /// # use arrow_array::{Array, ArrayRef, Int32Array, StructArray};
376    /// # use arrow_buffer::{BooleanBuffer, NullBuffer};
377    /// # use arrow_schema::{DataType, Field, Fields};
378    /// let child = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
379    /// let struct_nulls = NullBuffer::new(BooleanBuffer::from(vec![true, false, true]));
380    /// let sa = StructArray::new(
381    ///     Fields::from(vec![Field::new("a", DataType::Int32, false)]),
382    ///     vec![child],
383    ///     Some(struct_nulls),
384    /// );
385    /// let (fields, columns) = sa.flatten();
386    /// assert!(fields[0].is_nullable());
387    /// assert!(columns[0].is_null(1));
388    /// ```
389    pub fn flatten(&self) -> (Fields, Vec<ArrayRef>) {
390        let schema_fields = self.fields();
391
392        let struct_nulls = match &self.nulls {
393            Some(n) => n,
394            None => return (schema_fields.clone(), self.fields.clone()),
395        };
396
397        let new_fields: Fields = schema_fields
398            .iter()
399            .map(|f| {
400                if f.is_nullable() {
401                    Arc::clone(f)
402                } else {
403                    Arc::new(f.as_ref().clone().with_nullable(true))
404                }
405            })
406            .collect::<Vec<_>>()
407            .into();
408
409        let new_columns = self
410            .fields
411            .iter()
412            .map(|child| {
413                let merged = NullBuffer::union(Some(struct_nulls), child.nulls());
414                // SAFETY: We only make the null buffer more restrictive (adding nulls).
415                // All data buffers and child data remain unchanged.
416                let data = child.to_data().into_builder().nulls(merged);
417                make_array(unsafe { data.build_unchecked() })
418            })
419            .collect();
420
421        (new_fields, new_columns)
422    }
423}
424
425impl From<ArrayData> for StructArray {
426    fn from(data: ArrayData) -> Self {
427        let (data_type, len, nulls, offset, _buffers, child_data) = data.into_parts();
428
429        let parent_offset = offset;
430        let parent_len = len;
431
432        let fields = child_data
433            .into_iter()
434            .map(|cd| {
435                if parent_offset != 0 || parent_len != cd.len() {
436                    make_array(cd.slice(parent_offset, parent_len))
437                } else {
438                    make_array(cd)
439                }
440            })
441            .collect();
442
443        Self {
444            len,
445            data_type,
446            nulls,
447            fields,
448        }
449    }
450}
451
452impl From<StructArray> for ArrayData {
453    fn from(array: StructArray) -> Self {
454        let builder = ArrayDataBuilder::new(array.data_type)
455            .len(array.len)
456            .nulls(array.nulls)
457            .child_data(array.fields.iter().map(|x| x.to_data()).collect());
458
459        unsafe { builder.build_unchecked() }
460    }
461}
462
463impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
464    type Error = ArrowError;
465
466    /// builds a StructArray from a vector of names and arrays.
467    fn try_from(values: Vec<(&str, ArrayRef)>) -> Result<Self, ArrowError> {
468        let (fields, arrays): (Vec<_>, _) = values
469            .into_iter()
470            .map(|(name, array)| {
471                (
472                    Field::new(name, array.data_type().clone(), array.is_nullable()),
473                    array,
474                )
475            })
476            .unzip();
477
478        StructArray::try_new(fields.into(), arrays, None)
479    }
480}
481
482/// SAFETY: Correctly implements the contract of Arrow Arrays
483unsafe impl Array for StructArray {
484    fn as_any(&self) -> &dyn Any {
485        self
486    }
487
488    fn to_data(&self) -> ArrayData {
489        self.clone().into()
490    }
491
492    fn into_data(self) -> ArrayData {
493        self.into()
494    }
495
496    fn data_type(&self) -> &DataType {
497        &self.data_type
498    }
499
500    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
501        Arc::new(self.slice(offset, length))
502    }
503
504    fn len(&self) -> usize {
505        self.len
506    }
507
508    fn is_empty(&self) -> bool {
509        self.len == 0
510    }
511
512    fn shrink_to_fit(&mut self) {
513        if let Some(nulls) = &mut self.nulls {
514            nulls.shrink_to_fit();
515        }
516        self.fields.iter_mut().for_each(|n| n.shrink_to_fit());
517    }
518
519    fn offset(&self) -> usize {
520        0
521    }
522
523    fn nulls(&self) -> Option<&NullBuffer> {
524        self.nulls.as_ref()
525    }
526
527    fn logical_null_count(&self) -> usize {
528        // More efficient that the default implementation
529        self.null_count()
530    }
531
532    fn get_buffer_memory_size(&self) -> usize {
533        let mut size = self.fields.iter().map(|a| a.get_buffer_memory_size()).sum();
534        if let Some(n) = self.nulls.as_ref() {
535            size += n.buffer().capacity();
536        }
537        size
538    }
539
540    fn get_array_memory_size(&self) -> usize {
541        let mut size = self.fields.iter().map(|a| a.get_array_memory_size()).sum();
542        size += std::mem::size_of::<Self>();
543        if let Some(n) = self.nulls.as_ref() {
544            size += n.buffer().capacity();
545        }
546        size
547    }
548
549    #[cfg(feature = "pool")]
550    fn claim(&self, pool: &dyn arrow_buffer::MemoryPool) {
551        for field in &self.fields {
552            field.claim(pool);
553        }
554        if let Some(nulls) = &self.nulls {
555            nulls.claim(pool);
556        }
557    }
558}
559
560impl From<Vec<(FieldRef, ArrayRef)>> for StructArray {
561    fn from(v: Vec<(FieldRef, ArrayRef)>) -> Self {
562        let (fields, arrays): (Vec<_>, _) = v.into_iter().unzip();
563        StructArray::new(fields.into(), arrays, None)
564    }
565}
566
567impl std::fmt::Debug for StructArray {
568    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
569        writeln!(f, "StructArray")?;
570        writeln!(f, "-- validity:")?;
571        writeln!(f, "[")?;
572        print_long_array(self, f, |_array, _index, f| write!(f, "valid"))?;
573        writeln!(f, "]\n[")?;
574        for (child_index, name) in self.column_names().iter().enumerate() {
575            let column = self.column(child_index);
576            writeln!(
577                f,
578                "-- child {}: \"{}\" ({:?})",
579                child_index,
580                name,
581                column.data_type()
582            )?;
583            std::fmt::Debug::fmt(column, f)?;
584            writeln!(f)?;
585        }
586        write!(f, "]")
587    }
588}
589
590impl From<(Vec<(FieldRef, ArrayRef)>, Buffer)> for StructArray {
591    fn from(pair: (Vec<(FieldRef, ArrayRef)>, Buffer)) -> Self {
592        let len = pair.0.first().map(|x| x.1.len()).unwrap_or_default();
593        let (fields, arrays): (Vec<_>, Vec<_>) = pair.0.into_iter().unzip();
594        let nulls = NullBuffer::new(BooleanBuffer::new(pair.1, 0, len));
595        Self::new(fields.into(), arrays, Some(nulls))
596    }
597}
598
599impl From<RecordBatch> for StructArray {
600    fn from(value: RecordBatch) -> Self {
601        Self {
602            len: value.num_rows(),
603            data_type: DataType::Struct(value.schema().fields().clone()),
604            nulls: None,
605            fields: value.columns().to_vec(),
606        }
607    }
608}
609
610impl Index<&str> for StructArray {
611    type Output = ArrayRef;
612
613    /// Get a reference to a column's array by name.
614    ///
615    /// Note: A schema can currently have duplicate field names, in which case
616    /// the first field will always be selected.
617    /// This issue will be addressed in [ARROW-11178](https://issues.apache.org/jira/browse/ARROW-11178)
618    ///
619    /// # Panics
620    ///
621    /// Panics if the name is not in the schema.
622    fn index(&self, name: &str) -> &Self::Output {
623        self.column_by_name(name).unwrap()
624    }
625}
626
627#[cfg(test)]
628mod tests {
629    use super::*;
630
631    use crate::{BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, StringArray};
632    use arrow_buffer::ToByteSlice;
633
634    #[test]
635    fn test_struct_array_builder() {
636        let boolean_array = BooleanArray::from(vec![false, false, true, true]);
637        let int_array = Int64Array::from(vec![42, 28, 19, 31]);
638
639        let fields = vec![
640            Field::new("a", DataType::Boolean, false),
641            Field::new("b", DataType::Int64, false),
642        ];
643        let struct_array_data = ArrayData::builder(DataType::Struct(fields.into()))
644            .len(4)
645            .add_child_data(boolean_array.to_data())
646            .add_child_data(int_array.to_data())
647            .build()
648            .unwrap();
649        let struct_array = StructArray::from(struct_array_data);
650
651        assert_eq!(struct_array.column(0).as_ref(), &boolean_array);
652        assert_eq!(struct_array.column(1).as_ref(), &int_array);
653    }
654
655    #[test]
656    fn test_struct_array_from() {
657        let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true]));
658        let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31]));
659
660        let struct_array = StructArray::from(vec![
661            (
662                Arc::new(Field::new("b", DataType::Boolean, false)),
663                boolean.clone() as ArrayRef,
664            ),
665            (
666                Arc::new(Field::new("c", DataType::Int32, false)),
667                int.clone() as ArrayRef,
668            ),
669        ]);
670        assert_eq!(struct_array.column(0).as_ref(), boolean.as_ref());
671        assert_eq!(struct_array.column(1).as_ref(), int.as_ref());
672        assert_eq!(4, struct_array.len());
673        assert_eq!(0, struct_array.null_count());
674        assert_eq!(0, struct_array.offset());
675    }
676
677    #[test]
678    fn test_struct_array_from_data_with_offset_and_length() {
679        // Various ways to make the struct array:
680        //
681        // [{x: 2}, {x: 3}, None]
682        //
683        // from slicing larger buffers/arrays with offsets and lengths
684        let int_arr = Int32Array::from(vec![1, 2, 3, 4, 5]);
685        let int_field = Field::new("x", DataType::Int32, false);
686        let struct_nulls = NullBuffer::new(BooleanBuffer::from(vec![true, true, false]));
687        let int_data = int_arr.to_data();
688        // Case 1: Offset + length, nulls are not sliced
689        let case1 = ArrayData::builder(DataType::Struct(Fields::from(vec![int_field.clone()])))
690            .len(3)
691            .offset(1)
692            .nulls(Some(struct_nulls))
693            .add_child_data(int_data.clone())
694            .build()
695            .unwrap();
696
697        // Case 2: Offset + length, nulls are sliced
698        let struct_nulls =
699            NullBuffer::new(BooleanBuffer::from(vec![true, true, true, false, true]).slice(1, 3));
700        let case2 = ArrayData::builder(DataType::Struct(Fields::from(vec![int_field.clone()])))
701            .len(3)
702            .offset(1)
703            .nulls(Some(struct_nulls.clone()))
704            .add_child_data(int_data.clone())
705            .build()
706            .unwrap();
707
708        // Case 3: struct length is smaller than child length but no offset
709        let offset_int_data = int_data.slice(1, 4);
710        let case3 = ArrayData::builder(DataType::Struct(Fields::from(vec![int_field.clone()])))
711            .len(3)
712            .nulls(Some(struct_nulls))
713            .add_child_data(offset_int_data)
714            .build()
715            .unwrap();
716
717        let expected = StructArray::new(
718            Fields::from(vec![int_field.clone()]),
719            vec![Arc::new(int_arr)],
720            Some(NullBuffer::new(BooleanBuffer::from(vec![
721                true, true, true, false, true,
722            ]))),
723        )
724        .slice(1, 3);
725
726        for case in [case1, case2, case3] {
727            let struct_arr_from_data = StructArray::from(case);
728            assert_eq!(struct_arr_from_data, expected);
729            assert_eq!(struct_arr_from_data.column(0), expected.column(0));
730        }
731    }
732
733    #[test]
734    #[should_panic(expected = "assertion failed: end <= self.len()")]
735    fn test_struct_array_from_data_with_offset_and_length_error() {
736        let int_arr = Int32Array::from(vec![1, 2, 3, 4, 5]);
737        let int_field = Field::new("x", DataType::Int32, false);
738        let struct_nulls = NullBuffer::new(BooleanBuffer::from(vec![true, true, false]));
739        let int_data = int_arr.to_data();
740        // If parent offset is 3 and len is 3 then child must have 6 items
741        let struct_data =
742            ArrayData::builder(DataType::Struct(Fields::from(vec![int_field.clone()])))
743                .len(3)
744                .offset(3)
745                .nulls(Some(struct_nulls))
746                .add_child_data(int_data)
747                .build()
748                .unwrap();
749        let _ = StructArray::from(struct_data);
750    }
751
752    /// validates that struct can be accessed using `column_name` as index i.e. `struct_array["column_name"]`.
753    #[test]
754    fn test_struct_array_index_access() {
755        let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true]));
756        let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31]));
757
758        let struct_array = StructArray::from(vec![
759            (
760                Arc::new(Field::new("b", DataType::Boolean, false)),
761                boolean.clone() as ArrayRef,
762            ),
763            (
764                Arc::new(Field::new("c", DataType::Int32, false)),
765                int.clone() as ArrayRef,
766            ),
767        ]);
768        assert_eq!(struct_array["b"].as_ref(), boolean.as_ref());
769        assert_eq!(struct_array["c"].as_ref(), int.as_ref());
770    }
771
772    #[test]
773    fn test_struct_array_field_access() {
774        let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true]));
775        let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31]));
776
777        let b_field = Arc::new(Field::new("b", DataType::Boolean, false));
778        let c_field = Arc::new(Field::new("c", DataType::Int32, false));
779
780        let struct_array = StructArray::from(vec![
781            (b_field.clone(), boolean as ArrayRef),
782            (c_field.clone(), int as ArrayRef),
783        ]);
784
785        assert_eq!(struct_array.field(0), &b_field);
786        assert_eq!(struct_array.field(1), &c_field);
787
788        assert_eq!(struct_array.field_by_name("b"), Some(&b_field));
789        assert_eq!(struct_array.field_by_name("c"), Some(&c_field));
790        assert_eq!(struct_array.field_by_name("d"), None);
791    }
792
793    /// validates that the in-memory representation follows [the spec](https://arrow.apache.org/docs/format/Columnar.html#struct-layout)
794    #[test]
795    fn test_struct_array_from_vec() {
796        let strings: ArrayRef = Arc::new(StringArray::from(vec![
797            Some("joe"),
798            None,
799            None,
800            Some("mark"),
801        ]));
802        let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)]));
803
804        let arr =
805            StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]).unwrap();
806
807        let struct_data = arr.into_data();
808        assert_eq!(4, struct_data.len());
809        assert_eq!(0, struct_data.null_count());
810
811        let expected_string_data = ArrayData::builder(DataType::Utf8)
812            .len(4)
813            .null_bit_buffer(Some(Buffer::from(&[9_u8])))
814            .add_buffer(Buffer::from([0, 3, 3, 3, 7].to_byte_slice()))
815            .add_buffer(Buffer::from(b"joemark"))
816            .build()
817            .unwrap();
818
819        let expected_int_data = ArrayData::builder(DataType::Int32)
820            .len(4)
821            .null_bit_buffer(Some(Buffer::from(&[11_u8])))
822            .add_buffer(Buffer::from([1, 2, 0, 4].to_byte_slice()))
823            .build()
824            .unwrap();
825
826        assert_eq!(expected_string_data, struct_data.child_data()[0]);
827        assert_eq!(expected_int_data, struct_data.child_data()[1]);
828    }
829
830    #[test]
831    fn test_struct_array_from_vec_error() {
832        let strings: ArrayRef = Arc::new(StringArray::from(vec![
833            Some("joe"),
834            None,
835            None,
836            // 3 elements, not 4
837        ]));
838        let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)]));
839
840        let err = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())])
841            .unwrap_err()
842            .to_string();
843
844        assert_eq!(
845            err,
846            "Invalid argument error: Incorrect array length for StructArray field \"f2\", expected 3 got 4"
847        )
848    }
849
850    #[test]
851    #[should_panic(
852        expected = "Incorrect datatype for StructArray field \\\"b\\\", expected Int16 got Boolean"
853    )]
854    fn test_struct_array_from_mismatched_types_single() {
855        drop(StructArray::from(vec![(
856            Arc::new(Field::new("b", DataType::Int16, false)),
857            Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc<dyn Array>,
858        )]));
859    }
860
861    #[test]
862    #[should_panic(
863        expected = "Incorrect datatype for StructArray field \\\"b\\\", expected Int16 got Boolean"
864    )]
865    fn test_struct_array_from_mismatched_types_multiple() {
866        drop(StructArray::from(vec![
867            (
868                Arc::new(Field::new("b", DataType::Int16, false)),
869                Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc<dyn Array>,
870            ),
871            (
872                Arc::new(Field::new("c", DataType::Utf8, false)),
873                Arc::new(Int32Array::from(vec![42, 28, 19, 31])),
874            ),
875        ]));
876    }
877
878    #[test]
879    fn test_struct_array_slice() {
880        let boolean_data = ArrayData::builder(DataType::Boolean)
881            .len(5)
882            .add_buffer(Buffer::from([0b00010000]))
883            .null_bit_buffer(Some(Buffer::from([0b00010001])))
884            .build()
885            .unwrap();
886        let int_data = ArrayData::builder(DataType::Int32)
887            .len(5)
888            .add_buffer(Buffer::from([0, 28, 42, 0, 0].to_byte_slice()))
889            .null_bit_buffer(Some(Buffer::from([0b00000110])))
890            .build()
891            .unwrap();
892
893        let field_types = vec![
894            Field::new("a", DataType::Boolean, true),
895            Field::new("b", DataType::Int32, true),
896        ];
897        let struct_array_data = ArrayData::builder(DataType::Struct(field_types.into()))
898            .len(5)
899            .add_child_data(boolean_data.clone())
900            .add_child_data(int_data.clone())
901            .null_bit_buffer(Some(Buffer::from([0b00010111])))
902            .build()
903            .unwrap();
904        let struct_array = StructArray::from(struct_array_data);
905
906        assert_eq!(5, struct_array.len());
907        assert_eq!(1, struct_array.null_count());
908        assert!(struct_array.is_valid(0));
909        assert!(struct_array.is_valid(1));
910        assert!(struct_array.is_valid(2));
911        assert!(struct_array.is_null(3));
912        assert!(struct_array.is_valid(4));
913        assert_eq!(boolean_data, struct_array.column(0).to_data());
914        assert_eq!(int_data, struct_array.column(1).to_data());
915
916        let c0 = struct_array.column(0);
917        let c0 = c0.as_any().downcast_ref::<BooleanArray>().unwrap();
918        assert_eq!(5, c0.len());
919        assert_eq!(3, c0.null_count());
920        assert!(c0.is_valid(0));
921        assert!(!c0.value(0));
922        assert!(c0.is_null(1));
923        assert!(c0.is_null(2));
924        assert!(c0.is_null(3));
925        assert!(c0.is_valid(4));
926        assert!(c0.value(4));
927
928        let c1 = struct_array.column(1);
929        let c1 = c1.as_any().downcast_ref::<Int32Array>().unwrap();
930        assert_eq!(5, c1.len());
931        assert_eq!(3, c1.null_count());
932        assert!(c1.is_null(0));
933        assert!(c1.is_valid(1));
934        assert_eq!(28, c1.value(1));
935        assert!(c1.is_valid(2));
936        assert_eq!(42, c1.value(2));
937        assert!(c1.is_null(3));
938        assert!(c1.is_null(4));
939
940        let sliced_array = struct_array.slice(2, 3);
941        let sliced_array = sliced_array.as_any().downcast_ref::<StructArray>().unwrap();
942        assert_eq!(3, sliced_array.len());
943        assert_eq!(1, sliced_array.null_count());
944        assert!(sliced_array.is_valid(0));
945        assert!(sliced_array.is_null(1));
946        assert!(sliced_array.is_valid(2));
947
948        let sliced_c0 = sliced_array.column(0);
949        let sliced_c0 = sliced_c0.as_any().downcast_ref::<BooleanArray>().unwrap();
950        assert_eq!(3, sliced_c0.len());
951        assert!(sliced_c0.is_null(0));
952        assert!(sliced_c0.is_null(1));
953        assert!(sliced_c0.is_valid(2));
954        assert!(sliced_c0.value(2));
955
956        let sliced_c1 = sliced_array.column(1);
957        let sliced_c1 = sliced_c1.as_any().downcast_ref::<Int32Array>().unwrap();
958        assert_eq!(3, sliced_c1.len());
959        assert!(sliced_c1.is_valid(0));
960        assert_eq!(42, sliced_c1.value(0));
961        assert!(sliced_c1.is_null(1));
962        assert!(sliced_c1.is_null(2));
963    }
964
965    #[test]
966    #[should_panic(
967        expected = "Incorrect array length for StructArray field \\\"c\\\", expected 1 got 2"
968    )]
969    fn test_invalid_struct_child_array_lengths() {
970        drop(StructArray::from(vec![
971            (
972                Arc::new(Field::new("b", DataType::Float32, false)),
973                Arc::new(Float32Array::from(vec![1.1])) as Arc<dyn Array>,
974            ),
975            (
976                Arc::new(Field::new("c", DataType::Float64, false)),
977                Arc::new(Float64Array::from(vec![2.2, 3.3])),
978            ),
979        ]));
980    }
981
982    #[test]
983    #[should_panic(expected = "use StructArray::try_new_with_length")]
984    fn test_struct_array_from_empty() {
985        // This can't work because we don't know how many rows the array should have.  Previously we inferred 0 but
986        // that often led to bugs.
987        let _ = StructArray::from(vec![]);
988    }
989
990    #[test]
991    fn test_empty_struct_array() {
992        assert!(StructArray::try_new(Fields::empty(), vec![], None).is_err());
993
994        let arr = StructArray::new_empty_fields(10, None);
995        assert_eq!(arr.len(), 10);
996        assert_eq!(arr.null_count(), 0);
997        assert_eq!(arr.num_columns(), 0);
998
999        let arr2 = StructArray::try_new_with_length(Fields::empty(), vec![], None, 10).unwrap();
1000        assert_eq!(arr2.len(), 10);
1001
1002        let arr = StructArray::new_empty_fields(10, Some(NullBuffer::new_null(10)));
1003        assert_eq!(arr.len(), 10);
1004        assert_eq!(arr.null_count(), 10);
1005        assert_eq!(arr.num_columns(), 0);
1006
1007        let arr2 = StructArray::try_new_with_length(
1008            Fields::empty(),
1009            vec![],
1010            Some(NullBuffer::new_null(10)),
1011            10,
1012        )
1013        .unwrap();
1014        assert_eq!(arr2.len(), 10);
1015    }
1016
1017    #[test]
1018    #[should_panic(expected = "Found unmasked nulls for non-nullable StructArray field \\\"c\\\"")]
1019    fn test_struct_array_from_mismatched_nullability() {
1020        drop(StructArray::from(vec![(
1021            Arc::new(Field::new("c", DataType::Int32, false)),
1022            Arc::new(Int32Array::from(vec![Some(42), None, Some(19)])) as ArrayRef,
1023        )]));
1024    }
1025
1026    #[test]
1027    fn test_struct_array_fmt_debug() {
1028        let arr: StructArray = StructArray::new(
1029            vec![Arc::new(Field::new("c", DataType::Int32, true))].into(),
1030            vec![Arc::new(Int32Array::from((0..30).collect::<Vec<_>>())) as ArrayRef],
1031            Some(NullBuffer::new(BooleanBuffer::from(
1032                (0..30).map(|i| i % 2 == 0).collect::<Vec<_>>(),
1033            ))),
1034        );
1035        assert_eq!(
1036            format!("{arr:?}"),
1037            "StructArray\n-- validity:\n[\n  valid,\n  null,\n  valid,\n  null,\n  valid,\n  null,\n  valid,\n  null,\n  valid,\n  null,\n  ...10 elements...,\n  valid,\n  null,\n  valid,\n  null,\n  valid,\n  null,\n  valid,\n  null,\n  valid,\n  null,\n]\n[\n-- child 0: \"c\" (Int32)\nPrimitiveArray<Int32>\n[\n  0,\n  1,\n  2,\n  3,\n  4,\n  5,\n  6,\n  7,\n  8,\n  9,\n  ...10 elements...,\n  20,\n  21,\n  22,\n  23,\n  24,\n  25,\n  26,\n  27,\n  28,\n  29,\n]\n]"
1038        )
1039    }
1040
1041    #[test]
1042    fn test_struct_array_logical_nulls() {
1043        // Field is non-nullable
1044        let field = Field::new("a", DataType::Int32, false);
1045        let values = vec![1, 2, 3];
1046        // Create a NullBuffer with all bits set to valid (true)
1047        let nulls = NullBuffer::from(vec![true, true, true]);
1048        let array = Int32Array::new(values.into(), Some(nulls));
1049        let child = Arc::new(array) as ArrayRef;
1050        assert!(child.logical_nulls().is_some());
1051        assert_eq!(child.logical_nulls().unwrap().null_count(), 0);
1052
1053        let fields = Fields::from(vec![field]);
1054        let arrays = vec![child];
1055        let nulls = None;
1056
1057        StructArray::try_new(fields, arrays, nulls).expect("should not error");
1058    }
1059
1060    #[test]
1061    fn test_flatten_no_nulls() {
1062        let child = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
1063        let sa = StructArray::from(vec![(
1064            Arc::new(Field::new("a", DataType::Int32, false)),
1065            child,
1066        )]);
1067
1068        let (fields, columns) = sa.flatten();
1069
1070        assert_eq!(columns.len(), 1);
1071        assert!(!fields[0].is_nullable());
1072        assert_eq!(columns[0].null_count(), 0);
1073        assert_eq!(columns[0].len(), 3);
1074    }
1075
1076    #[test]
1077    fn test_flatten_struct_nulls_child_no_nulls() {
1078        let child = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
1079        let struct_nulls = NullBuffer::new(BooleanBuffer::from(vec![true, false, true]));
1080        let sa = StructArray::new(
1081            Fields::from(vec![Field::new("a", DataType::Int32, false)]),
1082            vec![child],
1083            Some(struct_nulls),
1084        );
1085
1086        let (fields, columns) = sa.flatten();
1087
1088        assert!(fields[0].is_nullable());
1089        assert!(columns[0].is_valid(0));
1090        assert!(columns[0].is_null(1));
1091        assert!(columns[0].is_valid(2));
1092        assert_eq!(columns[0].null_count(), 1);
1093    }
1094
1095    #[test]
1096    fn test_flatten_both_have_nulls() {
1097        // struct validity: [valid, null,  valid, valid]
1098        // child validity:  [valid, valid, null,  valid]
1099        // expected:        [valid, null,  null,  valid]
1100        let child = Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])) as ArrayRef;
1101        let struct_nulls = NullBuffer::new(BooleanBuffer::from(vec![true, false, true, true]));
1102        let sa = StructArray::new(
1103            Fields::from(vec![Field::new("a", DataType::Int32, true)]),
1104            vec![child],
1105            Some(struct_nulls),
1106        );
1107
1108        let (fields, columns) = sa.flatten();
1109
1110        assert!(fields[0].is_nullable());
1111        assert!(columns[0].is_valid(0));
1112        assert!(columns[0].is_null(1));
1113        assert!(columns[0].is_null(2));
1114        assert!(columns[0].is_valid(3));
1115        assert_eq!(columns[0].null_count(), 2);
1116    }
1117
1118    #[test]
1119    fn test_flatten_sliced_struct() {
1120        let child = Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as ArrayRef;
1121        let struct_nulls = NullBuffer::new(BooleanBuffer::from(vec![true, false, true, false]));
1122        let sa = StructArray::new(
1123            Fields::from(vec![Field::new("a", DataType::Int32, false)]),
1124            vec![child],
1125            Some(struct_nulls),
1126        );
1127        let sliced = sa.slice(1, 2);
1128
1129        let (fields, columns) = sliced.flatten();
1130
1131        assert!(fields[0].is_nullable());
1132        assert_eq!(columns[0].len(), 2);
1133        assert!(columns[0].is_null(0));
1134        assert!(columns[0].is_valid(1));
1135    }
1136
1137    #[test]
1138    fn test_flatten_multiple_children() {
1139        let int_child = Arc::new(Int32Array::from(vec![Some(1), Some(2), None])) as ArrayRef;
1140        let str_child = Arc::new(StringArray::from(vec![Some("a"), None, Some("c")])) as ArrayRef;
1141        let struct_nulls = NullBuffer::new(BooleanBuffer::from(vec![true, false, true]));
1142        let sa = StructArray::new(
1143            Fields::from(vec![
1144                Field::new("ints", DataType::Int32, true),
1145                Field::new("strs", DataType::Utf8, true),
1146            ]),
1147            vec![int_child, str_child],
1148            Some(struct_nulls),
1149        );
1150
1151        let (fields, columns) = sa.flatten();
1152
1153        assert_eq!(fields.len(), 2);
1154        // int: [valid, null(struct), null(child)] => null_count=2
1155        assert_eq!(columns[0].null_count(), 2);
1156        assert!(columns[0].is_valid(0));
1157        assert!(columns[0].is_null(1));
1158        assert!(columns[0].is_null(2));
1159        // str: [valid, null(struct+child), valid] => null_count=1
1160        assert_eq!(columns[1].null_count(), 1);
1161        assert!(columns[1].is_valid(0));
1162        assert!(columns[1].is_null(1));
1163        assert!(columns[1].is_valid(2));
1164    }
1165
1166    #[test]
1167    fn test_flatten_empty_struct() {
1168        let sa = StructArray::new_empty_fields(5, Some(NullBuffer::new_null(5)));
1169
1170        let (fields, columns) = sa.flatten();
1171
1172        assert_eq!(fields.len(), 0);
1173        assert_eq!(columns.len(), 0);
1174    }
1175
1176    #[test]
1177    fn test_flatten_field_nullability_update() {
1178        let non_null_child = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
1179        let nullable_child = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])) as ArrayRef;
1180        let struct_nulls = NullBuffer::new(BooleanBuffer::from(vec![true, true, false]));
1181        let sa = StructArray::new(
1182            Fields::from(vec![
1183                Field::new("non_null", DataType::Int32, false),
1184                Field::new("nullable", DataType::Int32, true),
1185            ]),
1186            vec![non_null_child, nullable_child],
1187            Some(struct_nulls),
1188        );
1189
1190        let (fields, _columns) = sa.flatten();
1191
1192        assert!(fields[0].is_nullable()); // was false, now true
1193        assert!(fields[1].is_nullable()); // was true, stays true
1194    }
1195}