Skip to main content

parquet_variant_compute/
variant_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArray`] implementation
19
20use crate::VariantArrayBuilder;
21use crate::type_conversion::{
22    generic_conversion_single_value, generic_conversion_single_value_with_result,
23    primitive_conversion_single_value,
24};
25use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
26use arrow::buffer::NullBuffer;
27use arrow::compute::cast;
28use arrow::datatypes::{
29    Date32Type, Decimal32Type, Decimal64Type, Decimal128Type, Float16Type, Float32Type,
30    Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, Time64MicrosecondType,
31    TimestampMicrosecondType, TimestampNanosecondType,
32};
33use arrow::error::Result;
34use arrow_schema::extension::ExtensionType;
35use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
36use chrono::{DateTime, NaiveTime};
37use parquet_variant::{
38    Uuid, Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16, VariantDecimalType as _,
39};
40
41use std::borrow::Cow;
42use std::sync::Arc;
43
/// Arrow Variant [`ExtensionType`].
///
/// Represents the canonical Arrow Extension Type for storing variants.
/// See [`VariantArray`] for more examples of using this extension type.
///
/// This is a unit struct: it carries no state of its own, and its extension
/// metadata is always the empty string.
pub struct VariantType;
49
50impl ExtensionType for VariantType {
51    const NAME: &'static str = "arrow.parquet.variant";
52
53    // Variants extension metadata is an empty string
54    // <https://github.com/apache/arrow/blob/d803afcc43f5d132506318fd9e162d33b2c3d4cd/docs/source/format/CanonicalExtensions.rst?plain=1#L473>
55    type Metadata = &'static str;
56
57    fn metadata(&self) -> &Self::Metadata {
58        &""
59    }
60
61    fn serialize_metadata(&self) -> Option<String> {
62        Some(String::new())
63    }
64
65    fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata> {
66        Ok("")
67    }
68
69    fn supports_data_type(&self, data_type: &DataType) -> Result<()> {
70        if matches!(data_type, DataType::Struct(_)) {
71            Ok(())
72        } else {
73            Err(ArrowError::InvalidArgumentError(format!(
74                "VariantType only supports StructArray, got {data_type}"
75            )))
76        }
77    }
78
79    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self> {
80        Self.supports_data_type(data_type)?;
81        Ok(Self)
82    }
83}
84
85/// An array of Parquet [`Variant`] values
86///
87/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
88/// `metadata` and `value` fields, and adds convenience methods to access
89/// the [`Variant`]s.
90///
91/// See [`VariantArrayBuilder`] for constructing `VariantArray` row by row.
92///
/// See the examples below for converting between `VariantArray` and
/// `StructArray`.
95///
96/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder
97///
98/// # Documentation
99///
/// At the time of this writing, Variant has been accepted as an official
/// extension type but has not yet been published to the [official list of
/// extension types] on the Apache Arrow website. See the [Extension Type for
/// Parquet Variant arrow] ticket for more details.
104///
105/// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908
106/// [official list of extension types]: https://arrow.apache.org/docs/format/CanonicalExtensions.html
107///
108/// # Example: Check if a [`StructArray`] has the [`VariantType`] extension
109///
110/// Arrow Arrays only provide [`DataType`], but the extension type information
111/// is stored on a [`Field`]. Thus, you must have access to the [`Schema`] or
112/// [`Field`] to check for the extension type.
113///
114/// [`Schema`]: arrow_schema::Schema
115/// ```
116/// # use arrow::array::StructArray;
117/// # use arrow_schema::{Schema, Field, DataType};
118/// # use parquet_variant::Variant;
119/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
120/// # fn get_variant_array() -> VariantArray {
121/// #   let mut builder = VariantArrayBuilder::new(10);
122/// #   builder.append_variant(Variant::from("such wow"));
123/// #   builder.build()
124/// # }
125/// # fn get_schema() -> Schema {
126/// #   Schema::new(vec![
127/// #     Field::new("id", DataType::Int32, false),
128/// #     get_variant_array().field("var"),
129/// #   ])
130/// # }
131/// let schema = get_schema();
132/// assert_eq!(schema.fields().len(), 2);
133/// // first field is not a Variant
134/// assert!(schema.field(0).try_extension_type::<VariantType>().is_err());
135/// // second field is a Variant
136/// assert!(schema.field(1).try_extension_type::<VariantType>().is_ok());
137/// ```
138///
139/// # Example: Constructing the correct [`Field`] for a [`VariantArray`]
140///
141/// You can construct the correct [`Field`] for a [`VariantArray`] using the
142/// [`VariantArray::field`] method.
143///
144/// ```
145/// # use arrow_schema::{Schema, Field, DataType};
146/// # use parquet_variant::Variant;
147/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
148/// # fn get_variant_array() -> VariantArray {
149/// #   let mut builder = VariantArrayBuilder::new(10);
150/// #   builder.append_variant(Variant::from("such wow"));
151/// #   builder.build()
152/// # }
153/// let variant_array = get_variant_array();
154/// // First field is an integer id, second field is a variant
155/// let schema = Schema::new(vec![
156///   Field::new("id", DataType::Int32, false),
157///   // call VariantArray::field to get the correct Field
158///   variant_array.field("var"),
159/// ]);
160/// ```
161///
162/// You can also construct the [`Field`] using [`VariantType`] directly
163///
164/// ```
165/// # use arrow_schema::{Schema, Field, DataType};
166/// # use parquet_variant::Variant;
167/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
168/// # fn get_variant_array() -> VariantArray {
169/// #   let mut builder = VariantArrayBuilder::new(10);
170/// #   builder.append_variant(Variant::from("such wow"));
171/// #   builder.build()
172/// # }
173/// # let variant_array = get_variant_array();
174/// // The DataType of a VariantArray varies depending on how it is shredded
175/// let data_type = variant_array.data_type().clone();
176/// // First field is an integer id, second field is a variant
177/// let schema = Schema::new(vec![
178///   Field::new("id", DataType::Int32, false),
179///   Field::new("var", data_type, false)
180///     // Add extension metadata to the field using `VariantType`
181///     .with_extension_type(VariantType),
182/// ]);
183/// ```
184///
185/// # Example: Converting a [`VariantArray`] to a [`StructArray`]
186///
187/// ```
188/// # use arrow::array::StructArray;
189/// # use parquet_variant::Variant;
190/// # use parquet_variant_compute::VariantArrayBuilder;
191/// // Create Variant Array
192/// let mut builder = VariantArrayBuilder::new(10);
193/// builder.append_variant(Variant::from("such wow"));
194/// let variant_array = builder.build();
195/// // convert to StructArray
196/// let struct_array: StructArray = variant_array.into();
197/// ```
198///
199/// # Example: Converting a [`StructArray`] to a [`VariantArray`]
200///
201/// ```
202/// # use arrow::array::StructArray;
203/// # use parquet_variant::Variant;
204/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray};
205/// # fn get_struct_array() -> StructArray {
206/// #   let mut builder = VariantArrayBuilder::new(10);
207/// #   builder.append_variant(Variant::from("such wow"));
208/// #   builder.build().into()
209/// # }
210/// let struct_array: StructArray = get_struct_array();
211/// // try and create a VariantArray from it
212/// let variant_array = VariantArray::try_new(&struct_array).unwrap();
213/// assert_eq!(variant_array.value(0), Variant::from("such wow"));
214/// ```
215///
#[derive(Debug, Clone, PartialEq)]
pub struct VariantArray {
    /// Reference to the underlying StructArray
    inner: StructArray,

    /// The metadata column of this variant
    metadata: BinaryViewArray,

    /// How this variant array is shredded, i.e. which of the optional `value`
    /// and `typed_value` columns are present
    shredding_state: ShreddingState,
}
227
228impl VariantArray {
229    /// Creates a new `VariantArray` from a [`StructArray`].
230    ///
231    /// # Arguments
232    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
233    ///
234    /// # Returns
235    /// - A new instance of `VariantArray`.
236    ///
237    /// # Errors:
238    /// - If the `StructArray` does not contain the required fields
239    ///
240    /// # Requirements of the `StructArray`
241    ///
242    /// 1. A required field named `metadata` which is binary, large_binary, or
243    ///    binary_view
244    ///
245    /// 2. An optional field named `value` that is binary, large_binary, or
246    ///    binary_view
247    ///
248    /// 3. An optional field named `typed_value` which can be any primitive type
249    ///    or be a list, large_list, list_view or struct
250    ///
251    /// NOTE: It is also permissible for the metadata field to be
252    /// Dictionary-Encoded, preferably (but not required) with an index type of
253    /// int8.
254    ///
255    /// Currently, only [`BinaryViewArray`] are supported.
256    pub fn try_new(inner: &dyn Array) -> Result<Self> {
257        // Workaround lack of support for Binary
258        // https://github.com/apache/arrow-rs/issues/8387
259        let inner = cast_to_binary_view_arrays(inner)?;
260
261        let Some(inner) = inner.as_struct_opt() else {
262            return Err(ArrowError::InvalidArgumentError(
263                "Invalid VariantArray: requires StructArray as input".to_string(),
264            ));
265        };
266
267        // Note the specification allows for any order so we must search by name
268
269        // Ensure the StructArray has a metadata field of BinaryView
270        let Some(metadata_field) = inner.column_by_name("metadata") else {
271            return Err(ArrowError::InvalidArgumentError(
272                "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
273            ));
274        };
275        let Some(metadata) = metadata_field.as_binary_view_opt() else {
276            return Err(ArrowError::NotYetImplemented(format!(
277                "VariantArray 'metadata' field must be BinaryView, got {}",
278                metadata_field.data_type()
279            )));
280        };
281
282        // Note these clones are cheap, they just bump the ref count
283        Ok(Self {
284            inner: inner.clone(),
285            metadata: metadata.clone(),
286            shredding_state: ShreddingState::try_from(inner)?,
287        })
288    }
289
290    pub(crate) fn from_parts(
291        metadata: BinaryViewArray,
292        value: Option<BinaryViewArray>,
293        typed_value: Option<ArrayRef>,
294        nulls: Option<NullBuffer>,
295    ) -> Self {
296        let mut builder =
297            StructArrayBuilder::new().with_field("metadata", Arc::new(metadata.clone()), false);
298        if let Some(value) = value.clone() {
299            builder = builder.with_field("value", Arc::new(value), true);
300        }
301        if let Some(typed_value) = typed_value.clone() {
302            builder = builder.with_field("typed_value", typed_value, true);
303        }
304        if let Some(nulls) = nulls {
305            builder = builder.with_nulls(nulls);
306        }
307
308        Self {
309            inner: builder.build(),
310            metadata,
311            shredding_state: ShreddingState::new(value, typed_value),
312        }
313    }
314
    /// Returns a reference to the underlying [`StructArray`].
    ///
    /// This only borrows; use [`VariantArray::into_inner`] to take ownership.
    pub fn inner(&self) -> &StructArray {
        &self.inner
    }
319
    /// Returns the inner [`StructArray`], consuming self
    ///
    /// The separately held metadata column and shredding state are dropped.
    pub fn into_inner(self) -> StructArray {
        self.inner
    }
324
    /// Return the shredding state of this `VariantArray`
    ///
    /// The state records which of the optional `value` and `typed_value`
    /// columns are present.
    pub fn shredding_state(&self) -> &ShreddingState {
        &self.shredding_state
    }
329
    /// Return the [`Variant`] instance stored at the given row
    ///
    /// This is a convenience wrapper that calls [`VariantArray::try_value`] and unwraps the `Result`.
    /// Use `try_value` if you need to handle conversion errors gracefully.
    ///
    /// # Panics
    /// * if the index is out of bounds
    /// * if the array value is null
    /// * if `try_value` returns an error.
    ///
    /// NOTE(review): `try_value` does not consult the struct-level null buffer,
    /// so it is unclear from this method alone that a null row panics; check
    /// [`VariantArray::is_null`] first if the distinction matters. TODO confirm.
    pub fn value(&self, index: usize) -> Variant<'_, '_> {
        self.try_value(index).unwrap()
    }
342
343    /// Return the [`Variant`] instance stored at the given row
344    ///
345    /// Note: This method does not check for nulls and the value is arbitrary
346    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
347    ///
348    /// # Panics
349    ///
350    /// Panics if
351    /// * the index is out of bounds
352    /// * the array value is null
353    ///
354    /// # Errors
355    ///
356    /// Errors if
357    /// - the data in `typed_value` cannot be interpreted as a valid `Variant`
358    ///
359    /// If this is a shredded variant but has no value at the shredded location, it
360    /// will return [`Variant::Null`].
361    ///
362    ///
363    /// # Performance Note
364    ///
365    /// This is certainly not the most efficient way to access values in a
366    /// `VariantArray`, but it is useful for testing and debugging.
367    ///
368    /// Note: Does not do deep validation of the [`Variant`], so it is up to the
369    /// caller to ensure that the metadata and value were constructed correctly.
370    pub fn try_value(&self, index: usize) -> Result<Variant<'_, '_>> {
371        match (self.typed_value_field(), self.value_field()) {
372            // Always prefer typed_value, if available
373            (Some(typed_value), value) if typed_value.is_valid(index) => {
374                typed_value_to_variant(typed_value, value, index)
375            }
376            // Otherwise fall back to value, if available
377            (_, Some(value)) if value.is_valid(index) => {
378                Ok(Variant::new(self.metadata.value(index), value.value(index)))
379            }
380            // It is technically invalid for neither value nor typed_value fields to be available,
381            // but the spec specifically requires readers to return Variant::Null in this case.
382            _ => Ok(Variant::Null),
383        }
384    }
385
    /// Return a reference to the metadata field of the [`StructArray`]
    ///
    /// Unlike `value` and `typed_value`, the metadata column is always present.
    pub fn metadata_field(&self) -> &BinaryViewArray {
        &self.metadata
    }
390
    /// Return a reference to the value field of the `StructArray`, if present
    ///
    /// Delegates to the array's [`ShreddingState`].
    pub fn value_field(&self) -> Option<&BinaryViewArray> {
        self.shredding_state.value_field()
    }
395
    /// Return a reference to the typed_value field of the `StructArray`, if present
    ///
    /// Delegates to the array's [`ShreddingState`].
    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
        self.shredding_state.typed_value_field()
    }
400
401    /// Return a field to represent this VariantArray in a `Schema` with
402    /// a particular name
403    pub fn field(&self, name: impl Into<String>) -> Field {
404        Field::new(
405            name.into(),
406            self.data_type().clone(),
407            self.inner.is_nullable(),
408        )
409        .with_extension_type(VariantType)
410    }
411
    /// Returns a reference to the [`DataType`] of the inner [`StructArray`]
    ///
    /// No new `DataType` is allocated; clone the result if an owned value is
    /// needed.
    pub fn data_type(&self) -> &DataType {
        self.inner.data_type()
    }
416
417    pub fn slice(&self, offset: usize, length: usize) -> Self {
418        let inner = self.inner.slice(offset, length);
419        let metadata = self.metadata.slice(offset, length);
420        let shredding_state = self.shredding_state.slice(offset, length);
421        Self {
422            inner,
423            metadata,
424            shredding_state,
425        }
426    }
427
    /// Returns the number of rows, as reported by the inner [`StructArray`]
    pub fn len(&self) -> usize {
        self.inner.len()
    }
431
    /// Returns true if the inner [`StructArray`] reports itself as empty
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }
435
    /// Returns the null buffer of the inner [`StructArray`], if it has one
    pub fn nulls(&self) -> Option<&NullBuffer> {
        self.inner.nulls()
    }
439
440    /// Is the element at index null?
441    pub fn is_null(&self, index: usize) -> bool {
442        self.nulls().is_some_and(|n| n.is_null(index))
443    }
444
    /// Is the element at index valid (not null)?
    ///
    /// This is the exact negation of [`VariantArray::is_null`].
    pub fn is_valid(&self, index: usize) -> bool {
        !self.is_null(index)
    }
449
    /// Returns an iterator over the values in this array
    ///
    /// Each item is `None` for a null row and `Some(variant)` otherwise; see
    /// [`VariantArrayIter`].
    pub fn iter(&self) -> VariantArrayIter<'_> {
        VariantArrayIter::new(self)
    }
454}
455
456impl From<VariantArray> for StructArray {
457    fn from(variant_array: VariantArray) -> Self {
458        variant_array.into_inner()
459    }
460}
461
462impl From<VariantArray> for ArrayRef {
463    fn from(variant_array: VariantArray) -> Self {
464        Arc::new(variant_array.into_inner())
465    }
466}
467
468impl<'m, 'v> FromIterator<Option<Variant<'m, 'v>>> for VariantArray {
469    fn from_iter<T: IntoIterator<Item = Option<Variant<'m, 'v>>>>(iter: T) -> Self {
470        let iter = iter.into_iter();
471
472        let mut b = VariantArrayBuilder::new(iter.size_hint().0);
473        b.extend(iter);
474        b.build()
475    }
476}
477
478impl<'m, 'v> FromIterator<Variant<'m, 'v>> for VariantArray {
479    fn from_iter<T: IntoIterator<Item = Variant<'m, 'v>>>(iter: T) -> Self {
480        Self::from_iter(iter.into_iter().map(Some))
481    }
482}
483
484/// An iterator over [`VariantArray`]
485///
486/// This iterator returns `Option<Option<Variant<'a, 'a>>>` where:
487/// - `None` indicates the end of iteration
488/// - `Some(None)` indicates a null value at this position
489/// - `Some(Some(variant))` indicates a valid variant value
490///
491/// # Example
492///
493/// ```
494/// # use parquet_variant::Variant;
495/// # use parquet_variant_compute::VariantArrayBuilder;
496/// let mut builder = VariantArrayBuilder::new(10);
497/// builder.append_variant(Variant::from(42));
498/// builder.append_null();
499/// builder.append_variant(Variant::from("hello"));
500/// let array = builder.build();
501///
502/// let values = array.iter().collect::<Vec<_>>();
503/// assert_eq!(values.len(), 3);
504/// assert_eq!(values[0], Some(Variant::from(42)));
505/// assert_eq!(values[1], None);
506/// assert_eq!(values[2], Some(Variant::from("hello")));
507/// ```
#[derive(Debug)]
pub struct VariantArrayIter<'a> {
    /// The array being iterated
    array: &'a VariantArray,
    /// Index of the next row yielded from the front
    head_i: usize,
    /// One past the index of the next row yielded from the back
    tail_i: usize,
}
514
515impl<'a> VariantArrayIter<'a> {
516    /// Creates a new iterator over the given [`VariantArray`]
517    pub fn new(array: &'a VariantArray) -> Self {
518        Self {
519            array,
520            head_i: 0,
521            tail_i: array.len(),
522        }
523    }
524
525    fn value_opt(&self, i: usize) -> Option<Variant<'a, 'a>> {
526        self.array.is_valid(i).then(|| self.array.value(i))
527    }
528}
529
530impl<'a> Iterator for VariantArrayIter<'a> {
531    type Item = Option<Variant<'a, 'a>>;
532
533    #[inline]
534    fn next(&mut self) -> Option<Self::Item> {
535        if self.head_i == self.tail_i {
536            return None;
537        }
538
539        let out = self.value_opt(self.head_i);
540
541        self.head_i += 1;
542
543        Some(out)
544    }
545
546    fn size_hint(&self) -> (usize, Option<usize>) {
547        let remainder = self.tail_i - self.head_i;
548
549        (remainder, Some(remainder))
550    }
551}
552
553impl<'a> DoubleEndedIterator for VariantArrayIter<'a> {
554    fn next_back(&mut self) -> Option<Self::Item> {
555        if self.head_i == self.tail_i {
556            return None;
557        }
558
559        self.tail_i -= 1;
560
561        Some(self.value_opt(self.tail_i))
562    }
563}
564
// `size_hint` returns the same remaining count (`tail_i - head_i`) for both
// its lower and upper bound, so the iterator's length is known exactly.
impl<'a> ExactSizeIterator for VariantArrayIter<'a> {}
566
567/// One shredded field of a partially or perfectly shredded variant. For example, suppose the
568/// shredding schema for variant `v` treats it as an object with a single field `a`, where `a` is
569/// itself a struct with the single field `b` of type INT. Then the physical layout of the column
570/// is:
571///
572/// ```text
573/// v: VARIANT {
574///     metadata: BINARY,
575///     value: BINARY,
576///     typed_value: STRUCT {
577///         a: SHREDDED_VARIANT_FIELD {
578///             value: BINARY,
579///             typed_value: STRUCT {
///                 b: SHREDDED_VARIANT_FIELD {
581///                     value: BINARY,
582///                     typed_value: INT,
583///                 },
584///             },
585///         },
586///     },
587/// }
588/// ```
589///
590/// In the above, each row of `v.value` is either a variant value (shredding failed, `v` was not an
591/// object at all) or a variant object (partial shredding, `v` was an object but included unexpected
592/// fields other than `a`), or is NULL (perfect shredding, `v` was an object containing only the
593/// single expected field `a`).
594///
595/// A similar story unfolds for each `v.typed_value.a.value` -- a variant value if shredding failed
596/// (`v:a` was not an object at all), or a variant object (`v:a` was an object with unexpected
597/// additional fields), or NULL (`v:a` was an object containing only the single expected field `b`).
598///
599/// Finally, `v.typed_value.a.typed_value.b.value` is either NULL (`v:a.b` was an integer) or else a
600/// variant value (which could be `Variant::Null`).
#[derive(Debug)]
pub struct ShreddedVariantFieldArray {
    /// Reference to the underlying StructArray
    inner: StructArray,
    /// The optional `value` and `typed_value` columns of this field
    shredding_state: ShreddingState,
}
607
608#[allow(unused)]
609impl ShreddedVariantFieldArray {
610    /// Creates a new `ShreddedVariantFieldArray` from a [`StructArray`].
611    ///
612    /// # Arguments
613    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
614    ///
615    /// # Returns
616    /// - A new instance of `ShreddedVariantFieldArray`.
617    ///
618    /// # Errors:
619    /// - If the `StructArray` does not contain the required fields
620    ///
621    /// # Requirements of the `StructArray`
622    ///
623    /// 1. An optional field named `value` that is binary, large_binary, or
624    ///    binary_view
625    ///
626    /// 2. An optional field named `typed_value` which can be any primitive type
627    ///    or be a list, large_list, list_view or struct
628    ///
629    /// Currently, only `value` columns of type [`BinaryViewArray`] are supported.
630    pub fn try_new(inner: &dyn Array) -> Result<Self> {
631        let Some(inner_struct) = inner.as_struct_opt() else {
632            return Err(ArrowError::InvalidArgumentError(
633                "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(),
634            ));
635        };
636
637        // Note this clone is cheap, it just bumps the ref count
638        Ok(Self {
639            inner: inner_struct.clone(),
640            shredding_state: ShreddingState::try_from(inner_struct)?,
641        })
642    }
643
    /// Return the shredding state of this `ShreddedVariantFieldArray`
    pub fn shredding_state(&self) -> &ShreddingState {
        &self.shredding_state
    }
648
    /// Return a reference to the value field of the `StructArray`, if present
    ///
    /// Delegates to the array's [`ShreddingState`].
    pub fn value_field(&self) -> Option<&BinaryViewArray> {
        self.shredding_state.value_field()
    }
653
    /// Return a reference to the typed_value field of the `StructArray`, if present
    ///
    /// Delegates to the array's [`ShreddingState`].
    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
        self.shredding_state.typed_value_field()
    }
658
    /// Returns a reference to the underlying [`StructArray`].
    ///
    /// This only borrows; use [`ShreddedVariantFieldArray::into_inner`] to
    /// take ownership.
    pub fn inner(&self) -> &StructArray {
        &self.inner
    }
663
664    pub(crate) fn from_parts(
665        value: Option<BinaryViewArray>,
666        typed_value: Option<ArrayRef>,
667        nulls: Option<NullBuffer>,
668    ) -> Self {
669        let mut builder = StructArrayBuilder::new();
670        if let Some(value) = value.clone() {
671            builder = builder.with_field("value", Arc::new(value), true);
672        }
673        if let Some(typed_value) = typed_value.clone() {
674            builder = builder.with_field("typed_value", typed_value, true);
675        }
676        if let Some(nulls) = nulls {
677            builder = builder.with_nulls(nulls);
678        }
679
680        Self {
681            inner: builder.build(),
682            shredding_state: ShreddingState::new(value, typed_value),
683        }
684    }
685
    /// Returns the inner [`StructArray`], consuming self
    ///
    /// The separately held shredding state is dropped.
    pub fn into_inner(self) -> StructArray {
        self.inner
    }
690
    /// Returns a reference to the [`DataType`] of the inner [`StructArray`]
    pub fn data_type(&self) -> &DataType {
        self.inner.data_type()
    }
694
    /// Returns the number of rows, as reported by the inner [`StructArray`]
    pub fn len(&self) -> usize {
        self.inner.len()
    }
698
    /// Returns true if the inner [`StructArray`] reports itself as empty
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }
702
    /// Returns the offset reported by the inner [`StructArray`]
    pub fn offset(&self) -> usize {
        self.inner.offset()
    }
706
    /// Returns the null buffer of this array, which is always `None`.
    ///
    /// Any null buffer carried by the underlying [`StructArray`] is
    /// deliberately not reported here.
    pub fn nulls(&self) -> Option<&NullBuffer> {
        // According to the shredding spec, ShreddedVariantFieldArray should be
        // physically non-nullable - SQL NULL is inferred by both value and
        // typed_value being physically NULL
        None
    }
713    /// Is the element at index null?
714    pub fn is_null(&self, index: usize) -> bool {
715        self.nulls().is_some_and(|n| n.is_null(index))
716    }
717
    /// Is the element at index valid (not null)?
    ///
    /// This is the exact negation of [`ShreddedVariantFieldArray::is_null`].
    pub fn is_valid(&self, index: usize) -> bool {
        !self.is_null(index)
    }
722}
723
724impl From<ShreddedVariantFieldArray> for ArrayRef {
725    fn from(array: ShreddedVariantFieldArray) -> Self {
726        Arc::new(array.into_inner())
727    }
728}
729
730impl From<ShreddedVariantFieldArray> for StructArray {
731    fn from(array: ShreddedVariantFieldArray) -> Self {
732        array.into_inner()
733    }
734}
735
736/// Represents the shredding state of a [`VariantArray`]
737///
/// [`VariantArray`]s can be shredded according to the [Parquet Variant
/// Shredding Spec]. Shredding means that the actual value is stored in a typed
/// `typed_value` field instead of the generic `value` field.
741///
742/// Both value and typed_value are optional fields used together to encode a
743/// single value. Values in the two fields must be interpreted according to the
744/// following table (see [Parquet Variant Shredding Spec] for more details):
745///
746/// | value    | typed_value  | Meaning |
747/// |----------|--------------|---------|
748/// | NULL     | NULL         | The value is missing; only valid for shredded object fields |
749/// | non-NULL | NULL         | The value is present and may be any type, including [`Variant::Null`] |
750/// | NULL     | non-NULL     | The value is present and is the shredded type |
751/// | non-NULL | non-NULL     | The value is present and is a partially shredded object |
752///
753///
754/// Applying the above rules to entire columns, we obtain the following:
755///
756/// | value  | typed_value  | Meaning |
757/// |--------|-------------|---------|
758/// | --     | --          | **Missing**: The value is always missing; only valid for shredded object fields |
/// | exists | --          | **Unshredded**: If present, the value may be any type, including [`Variant::Null`] |
760/// | --     | exists      | **Perfectly shredded**: If present, the value is always the shredded type |
761/// | exists | exists      | **Imperfectly shredded**: The value might (not) be present and might (not) be the shredded type |
762///
763/// NOTE: Partial shredding is a row-wise situation that can arise under imperfect shredding (a
764/// column-wise situation): When both columns exist (imperfect shredding) and the typed_value column
765/// is a struct, then both columns can be non-NULL for the same row if value is a variant object
766/// (partial shredding).
767///
768/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
#[derive(Debug, Clone, PartialEq)]
pub struct ShreddingState {
    /// The `value` column, if it exists
    value: Option<BinaryViewArray>,
    /// The `typed_value` column, if it exists
    typed_value: Option<ArrayRef>,
}
774
775impl ShreddingState {
776    /// Create a new `ShreddingState` from the given `value` and `typed_value` fields
777    ///
778    /// Note you can create a `ShreddingState` from a &[`StructArray`] using
779    /// `ShreddingState::try_from(&struct_array)`, for example:
780    ///
781    /// ```no_run
782    /// # use arrow::array::StructArray;
783    /// # use parquet_variant_compute::ShreddingState;
784    /// # fn get_struct_array() -> StructArray {
785    /// #   unimplemented!()
786    /// # }
787    /// let struct_array: StructArray = get_struct_array();
788    /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
789    /// ```
790    pub fn new(value: Option<BinaryViewArray>, typed_value: Option<ArrayRef>) -> Self {
791        Self { value, typed_value }
792    }
793
794    /// Return a reference to the value field, if present
795    pub fn value_field(&self) -> Option<&BinaryViewArray> {
796        self.value.as_ref()
797    }
798
799    /// Return a reference to the typed_value field, if present
800    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
801        self.typed_value.as_ref()
802    }
803
804    /// Returns a borrowed version of this shredding state
805    pub fn borrow(&self) -> BorrowedShreddingState<'_> {
806        BorrowedShreddingState {
807            value: self.value_field(),
808            typed_value: self.typed_value_field(),
809        }
810    }
811
812    /// Slice all the underlying arrays
813    pub fn slice(&self, offset: usize, length: usize) -> Self {
814        Self {
815            value: self.value.as_ref().map(|v| v.slice(offset, length)),
816            typed_value: self.typed_value.as_ref().map(|tv| tv.slice(offset, length)),
817        }
818    }
819}
820
/// Similar to [`ShreddingState`] except it holds borrowed references of the target arrays. Useful
/// for avoiding clone operations when the caller does not need a self-standing shredding state.
#[derive(Clone, Debug)]
pub struct BorrowedShreddingState<'a> {
    /// Borrowed `value` column, if it exists
    value: Option<&'a BinaryViewArray>,
    /// Borrowed `typed_value` column, if it exists
    typed_value: Option<&'a ArrayRef>,
}
828
impl<'a> BorrowedShreddingState<'a> {
    /// Create a new `BorrowedShreddingState` from the given `value` and `typed_value` fields
    ///
    /// Both arguments are stored as-is; no validation is performed here.
    ///
    /// Note you can create a `BorrowedShreddingState` from a &[`StructArray`] using
    /// `BorrowedShreddingState::try_from(&struct_array)`, for example:
    ///
    /// ```no_run
    /// # use arrow::array::StructArray;
    /// # use parquet_variant_compute::BorrowedShreddingState;
    /// # fn get_struct_array() -> StructArray {
    /// #   unimplemented!()
    /// # }
    /// let struct_array: StructArray = get_struct_array();
    /// let shredding_state = BorrowedShreddingState::try_from(&struct_array).unwrap();
    /// ```
    pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a ArrayRef>) -> Self {
        Self { value, typed_value }
    }

    /// Return a reference to the value field, if present
    ///
    /// The returned reference carries the full `'a` lifetime rather than the
    /// lifetime of `&self`.
    pub fn value_field(&self) -> Option<&'a BinaryViewArray> {
        self.value
    }

    /// Return a reference to the typed_value field, if present
    ///
    /// The returned reference carries the full `'a` lifetime rather than the
    /// lifetime of `&self`.
    pub fn typed_value_field(&self) -> Option<&'a ArrayRef> {
        self.typed_value
    }
}
858
859impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
860    type Error = ArrowError;
861
862    fn try_from(inner_struct: &'a StructArray) -> Result<Self> {
863        // The `value` column need not exist, but if it does it must be a binary view.
864        let value = if let Some(value_col) = inner_struct.column_by_name("value") {
865            let Some(binary_view) = value_col.as_binary_view_opt() else {
866                return Err(ArrowError::NotYetImplemented(format!(
867                    "VariantArray 'value' field must be BinaryView, got {}",
868                    value_col.data_type()
869                )));
870            };
871            Some(binary_view)
872        } else {
873            None
874        };
875        let typed_value = inner_struct.column_by_name("typed_value");
876        Ok(BorrowedShreddingState::new(value, typed_value))
877    }
878}
879
880impl TryFrom<&StructArray> for ShreddingState {
881    type Error = ArrowError;
882
883    fn try_from(inner_struct: &StructArray) -> Result<Self> {
884        Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
885    }
886}
887
888impl From<BorrowedShreddingState<'_>> for ShreddingState {
889    fn from(state: BorrowedShreddingState<'_>) -> Self {
890        ShreddingState {
891            value: state.value_field().cloned(),
892            typed_value: state.typed_value_field().cloned(),
893        }
894    }
895}
896
897/// Builds struct arrays from component fields
898///
899/// TODO: move to arrow crate
900#[derive(Debug, Default, Clone)]
901pub(crate) struct StructArrayBuilder {
902    fields: Vec<FieldRef>,
903    arrays: Vec<ArrayRef>,
904    nulls: Option<NullBuffer>,
905}
906
907impl StructArrayBuilder {
908    pub fn new() -> Self {
909        Default::default()
910    }
911
912    /// Add an array to this struct array as a field with the specified name.
913    pub fn with_field(mut self, field_name: &str, array: ArrayRef, nullable: bool) -> Self {
914        let field = Field::new(field_name, array.data_type().clone(), nullable);
915        self.fields.push(Arc::new(field));
916        self.arrays.push(array);
917        self
918    }
919
920    /// Set the null buffer for this struct array.
921    pub fn with_nulls(mut self, nulls: NullBuffer) -> Self {
922        self.nulls = Some(nulls);
923        self
924    }
925
926    pub fn build(self) -> StructArray {
927        let Self {
928            fields,
929            arrays,
930            nulls,
931        } = self;
932        StructArray::new(Fields::from(fields), arrays, nulls)
933    }
934}
935
936/// returns the non-null element at index as a Variant
937fn typed_value_to_variant<'a>(
938    typed_value: &'a ArrayRef,
939    value: Option<&BinaryViewArray>,
940    index: usize,
941) -> Result<Variant<'a, 'a>> {
942    let data_type = typed_value.data_type();
943    if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) {
944        // Only a partially shredded struct is allowed to have values for both columns
945        panic!("Invalid variant, conflicting value and typed_value");
946    }
947    match data_type {
948        DataType::Null => Ok(Variant::Null),
949        DataType::Boolean => {
950            let boolean_array = typed_value.as_boolean();
951            let value = boolean_array.value(index);
952            Ok(Variant::from(value))
953        }
954        // 16-byte FixedSizeBinary alway corresponds to a UUID; all other sizes are illegal.
955        DataType::FixedSizeBinary(16) => {
956            let array = typed_value.as_fixed_size_binary();
957            let value = array.value(index);
958            Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe: slice is always 16 bytes
959        }
960        DataType::BinaryView => {
961            let array = typed_value.as_binary_view();
962            let value = array.value(index);
963            Ok(Variant::from(value))
964        }
965        DataType::Utf8 => {
966            let array = typed_value.as_string::<i32>();
967            let value = array.value(index);
968            Ok(Variant::from(value))
969        }
970        DataType::LargeUtf8 => {
971            let array = typed_value.as_string::<i64>();
972            let value = array.value(index);
973            Ok(Variant::from(value))
974        }
975        DataType::Utf8View => {
976            let array = typed_value.as_string_view();
977            let value = array.value(index);
978            Ok(Variant::from(value))
979        }
980        DataType::Int8 => {
981            primitive_conversion_single_value!(Int8Type, typed_value, index)
982        }
983        DataType::Int16 => {
984            primitive_conversion_single_value!(Int16Type, typed_value, index)
985        }
986        DataType::Int32 => {
987            primitive_conversion_single_value!(Int32Type, typed_value, index)
988        }
989        DataType::Int64 => {
990            primitive_conversion_single_value!(Int64Type, typed_value, index)
991        }
992        DataType::Float16 => {
993            primitive_conversion_single_value!(Float16Type, typed_value, index)
994        }
995        DataType::Float32 => {
996            primitive_conversion_single_value!(Float32Type, typed_value, index)
997        }
998        DataType::Float64 => {
999            primitive_conversion_single_value!(Float64Type, typed_value, index)
1000        }
1001        DataType::Decimal32(_, s) => {
1002            generic_conversion_single_value_with_result!(
1003                Decimal32Type,
1004                as_primitive,
1005                |v| VariantDecimal4::try_new(v, *s as u8),
1006                typed_value,
1007                index
1008            )
1009        }
1010        DataType::Decimal64(_, s) => {
1011            generic_conversion_single_value_with_result!(
1012                Decimal64Type,
1013                as_primitive,
1014                |v| VariantDecimal8::try_new(v, *s as u8),
1015                typed_value,
1016                index
1017            )
1018        }
1019        DataType::Decimal128(_, s) => {
1020            generic_conversion_single_value_with_result!(
1021                Decimal128Type,
1022                as_primitive,
1023                |v| VariantDecimal16::try_new(v, *s as u8),
1024                typed_value,
1025                index
1026            )
1027        }
1028        DataType::Date32 => {
1029            generic_conversion_single_value!(
1030                Date32Type,
1031                as_primitive,
1032                |v| Date32Type::to_naive_date_opt(v).unwrap(),
1033                typed_value,
1034                index
1035            )
1036        }
1037        DataType::Time64(TimeUnit::Microsecond) => {
1038            generic_conversion_single_value_with_result!(
1039                Time64MicrosecondType,
1040                as_primitive,
1041                |v| NaiveTime::from_num_seconds_from_midnight_opt(
1042                    (v / 1_000_000) as u32,
1043                    (v % 1_000_000) as u32 * 1000
1044                )
1045                .ok_or_else(|| format!("Invalid microsecond from midnight: {}", v)),
1046                typed_value,
1047                index
1048            )
1049        }
1050        DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => {
1051            generic_conversion_single_value!(
1052                TimestampMicrosecondType,
1053                as_primitive,
1054                |v| DateTime::from_timestamp_micros(v).unwrap(),
1055                typed_value,
1056                index
1057            )
1058        }
1059        DataType::Timestamp(TimeUnit::Microsecond, None) => {
1060            generic_conversion_single_value!(
1061                TimestampMicrosecondType,
1062                as_primitive,
1063                |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(),
1064                typed_value,
1065                index
1066            )
1067        }
1068        DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => {
1069            generic_conversion_single_value!(
1070                TimestampNanosecondType,
1071                as_primitive,
1072                DateTime::from_timestamp_nanos,
1073                typed_value,
1074                index
1075            )
1076        }
1077        DataType::Timestamp(TimeUnit::Nanosecond, None) => {
1078            generic_conversion_single_value!(
1079                TimestampNanosecondType,
1080                as_primitive,
1081                |v| DateTime::from_timestamp_nanos(v).naive_utc(),
1082                typed_value,
1083                index
1084            )
1085        }
1086        // todo other types here (note this is very similar to cast_to_variant.rs)
1087        // so it would be great to figure out how to share this code
1088        _ => {
1089            // We shouldn't panic in production code, but this is a
1090            // placeholder until we implement more types
1091            // https://github.com/apache/arrow-rs/issues/8091
1092            debug_assert!(
1093                false,
1094                "Unsupported typed_value type: {}",
1095                typed_value.data_type()
1096            );
1097            Ok(Variant::Null)
1098        }
1099    }
1100}
1101
1102/// Workaround for lack of direct support for BinaryArray
1103/// <https://github.com/apache/arrow-rs/issues/8387>
1104///
1105/// The values are read as
1106/// * `StructArray<metadata: Binary, value: Binary>`
1107///
1108/// but VariantArray needs them as
1109/// * `StructArray<metadata: BinaryView, value: BinaryView>`
1110///
1111/// So cast them to get the right type.
1112fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef> {
1113    let new_type = canonicalize_and_verify_data_type(array.data_type())?;
1114    if let Cow::Borrowed(_) = new_type {
1115        if let Some(array) = array.as_struct_opt() {
1116            return Ok(Arc::new(array.clone())); // bypass the unnecessary cast
1117        }
1118    }
1119    cast(array, new_type.as_ref())
1120}
1121
1122/// Recursively visits a data type, ensuring that it only contains data types that can legally
1123/// appear in a (possibly shredded) variant array. It also replaces Binary fields with BinaryView,
1124/// since that's what comes back from the parquet reader and what the variant code expects to find.
1125fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_, DataType>> {
1126    use DataType::*;
1127
1128    // helper macros
1129    macro_rules! fail {
1130        () => {
1131            return Err(ArrowError::InvalidArgumentError(format!(
1132                "Illegal shredded value type: {data_type}"
1133            )))
1134        };
1135    }
1136    macro_rules! borrow {
1137        () => {
1138            Cow::Borrowed(data_type)
1139        };
1140    }
1141
1142    let new_data_type = match data_type {
1143        // Primitive arrow types that have a direct variant counterpart are allowed
1144        Null | Boolean => borrow!(),
1145        Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(),
1146
1147        // Unsigned integers and half-float are not allowed
1148        UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(),
1149
1150        // Most decimal types are allowed, with restrictions on precision and scale
1151        //
1152        // NOTE: arrow-parquet reads widens 32- and 64-bit decimals to 128-bit, but the variant spec
1153        // requires using the narrowest decimal type for a given precision. Fix those up first.
1154        Decimal64(p, s) | Decimal128(p, s)
1155            if VariantDecimal4::is_valid_precision_and_scale(p, s) =>
1156        {
1157            Cow::Owned(Decimal32(*p, *s))
1158        }
1159        Decimal128(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => {
1160            Cow::Owned(Decimal64(*p, *s))
1161        }
1162        Decimal32(p, s) if VariantDecimal4::is_valid_precision_and_scale(p, s) => borrow!(),
1163        Decimal64(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => borrow!(),
1164        Decimal128(p, s) if VariantDecimal16::is_valid_precision_and_scale(p, s) => borrow!(),
1165        Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(),
1166
1167        // Only micro and nano timestamps are allowed
1168        Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(),
1169        Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(),
1170
1171        // Only 32-bit dates and 64-bit microsecond time are allowed.
1172        Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
1173        Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
1174
1175        // Binary and string are allowed. Force Binary/LargeBinary to BinaryView because that's what the parquet
1176        // reader returns and what the rest of the variant code expects.
1177        Binary | LargeBinary => Cow::Owned(BinaryView),
1178        BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),
1179
1180        // UUID maps to 16-byte fixed-size binary; no other width is allowed
1181        FixedSizeBinary(16) => borrow!(),
1182        FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
1183
1184        // List-like containers and struct are allowed, maps and unions are not
1185        List(field) => match canonicalize_and_verify_field(field)? {
1186            Cow::Borrowed(_) => borrow!(),
1187            Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)),
1188        },
1189        LargeList(field) => match canonicalize_and_verify_field(field)? {
1190            Cow::Borrowed(_) => borrow!(),
1191            Cow::Owned(new_field) => Cow::Owned(DataType::LargeList(new_field)),
1192        },
1193        ListView(field) => match canonicalize_and_verify_field(field)? {
1194            Cow::Borrowed(_) => borrow!(),
1195            Cow::Owned(new_field) => Cow::Owned(DataType::ListView(new_field)),
1196        },
1197        LargeListView(field) => match canonicalize_and_verify_field(field)? {
1198            Cow::Borrowed(_) => borrow!(),
1199            Cow::Owned(new_field) => Cow::Owned(DataType::LargeListView(new_field)),
1200        },
1201        // Struct is used by the internal layout, and can also represent a shredded variant object.
1202        Struct(fields) => {
1203            // Avoid allocation unless at least one field changes, to avoid unnecessary deep cloning
1204            // of the data type. Even if some fields change, the others are shallow arc clones.
1205            let mut new_fields = std::collections::HashMap::new();
1206            for (i, field) in fields.iter().enumerate() {
1207                if let Cow::Owned(new_field) = canonicalize_and_verify_field(field)? {
1208                    new_fields.insert(i, new_field);
1209                }
1210            }
1211
1212            if new_fields.is_empty() {
1213                borrow!()
1214            } else {
1215                let new_fields = fields
1216                    .iter()
1217                    .enumerate()
1218                    .map(|(i, field)| new_fields.remove(&i).unwrap_or_else(|| field.clone()));
1219                Cow::Owned(DataType::Struct(new_fields.collect()))
1220            }
1221        }
1222        Map(..) | Union(..) => fail!(),
1223
1224        // We can _possibly_ support (some of) these some day?
1225        Dictionary(..) | RunEndEncoded(..) => fail!(),
1226    };
1227    Ok(new_data_type)
1228}
1229
1230fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, Arc<Field>>> {
1231    let Cow::Owned(new_data_type) = canonicalize_and_verify_data_type(field.data_type())? else {
1232        return Ok(Cow::Borrowed(field));
1233    };
1234    let new_field = field.as_ref().clone().with_data_type(new_data_type);
1235    Ok(Cow::Owned(Arc::new(new_field)))
1236}
1237
1238#[cfg(test)]
1239mod test {
1240    use crate::VariantArrayBuilder;
1241    use std::str::FromStr;
1242
1243    use super::*;
1244    use arrow::array::{
1245        BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array, Int32Array, Int64Array,
1246        LargeListArray, LargeListViewArray, ListArray, ListViewArray, Time64MicrosecondArray,
1247    };
1248    use arrow::buffer::{OffsetBuffer, ScalarBuffer};
1249    use arrow_schema::{Field, Fields};
1250    use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, ShortString};
1251
#[test]
fn invalid_not_a_struct_array() {
    // A bare binary view array is not a StructArray, so construction must be rejected.
    let not_a_struct = make_binary_view_array();
    let message = VariantArray::try_new(&not_a_struct)
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "Invalid argument error: Invalid VariantArray: requires StructArray as input"
    );
}
1262
#[test]
fn invalid_missing_metadata() {
    // The struct only has a 'value' column; the mandatory 'metadata' column is absent.
    let value_only = StructArray::new(
        Fields::from(vec![Field::new("value", DataType::BinaryView, true)]),
        vec![make_binary_view_array()],
        None,
    );
    let message = VariantArray::try_new(&value_only).unwrap_err().to_string();
    assert_eq!(
        message,
        "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
    );
}
1274
#[test]
fn all_null_missing_value_and_typed_value() {
    // Only 'metadata' is present: the struct has neither 'value' nor 'typed_value'.
    let metadata_only = StructArray::new(
        Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]),
        vec![make_binary_view_array()],
        None,
    );

    // NOTE: Under a strict reading of the spec a top-level variant with null/null is
    // invalid. We currently accept it and treat it as Variant::Null, a pragmatic
    // choice that handles missing data gracefully.
    let variant_array = VariantArray::try_new(&metadata_only).unwrap();

    // Neither column may be present in the shredding state (the "AllNull" shape).
    let state = variant_array.shredding_state();
    assert!(state.value_field().is_none());
    assert!(state.typed_value_field().is_none());

    // Every valid row must read back as Variant::Null (compensating for the spec violation).
    let valid_rows = (0..variant_array.len()).filter(|&row| variant_array.is_valid(row));
    for row in valid_rows {
        assert_eq!(variant_array.value(row), Variant::Null);
    }
}
1301
#[test]
fn invalid_metadata_field_type() {
    // 'metadata' is declared as Int32, which is not supported.
    let metadata_field = Field::new("metadata", DataType::Int32, true);
    let value_field = Field::new("value", DataType::BinaryView, true);
    let columns = vec![make_int32_array(), make_binary_view_array()];
    let array = StructArray::new(
        Fields::from(vec![metadata_field, value_field]),
        columns,
        None,
    );

    let message = VariantArray::try_new(&array).unwrap_err().to_string();
    assert_eq!(
        message,
        "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Int32"
    );
}
1319
#[test]
fn invalid_value_field_type() {
    // 'value' is declared as Int32, which is not yet supported.
    let metadata_field = Field::new("metadata", DataType::BinaryView, true);
    let value_field = Field::new("value", DataType::Int32, true);
    let columns = vec![make_binary_view_array(), make_int32_array()];
    let array = StructArray::new(
        Fields::from(vec![metadata_field, value_field]),
        columns,
        None,
    );

    let message = VariantArray::try_new(&array).unwrap_err().to_string();
    assert_eq!(
        message,
        "Not yet implemented: VariantArray 'value' field must be BinaryView, got Int32"
    );
}
1337
1338    fn make_binary_view_array() -> ArrayRef {
1339        Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]]))
1340    }
1341
1342    fn make_int32_array() -> ArrayRef {
1343        Arc::new(Int32Array::from(vec![1]))
1344    }
1345
1346    fn make_variant_struct_with_typed_value(typed_value: ArrayRef) -> StructArray {
1347        let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
1348            EMPTY_VARIANT_METADATA_BYTES,
1349            typed_value.len(),
1350        ));
1351        StructArrayBuilder::new()
1352            .with_field("metadata", Arc::new(metadata), false)
1353            .with_field("typed_value", typed_value, true)
1354            .build()
1355    }
1356
#[test]
fn all_null_shredding_state() {
    // A state built from (None, None) must expose neither column (the "AllNull" shape).
    let state = ShreddingState::new(None, None);
    assert!(state.value_field().is_none());
    assert!(state.typed_value_field().is_none());
}
1368
#[test]
fn all_null_variant_array_construction() {
    // Three rows of metadata, with every struct row marked null.
    let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
    let all_null = NullBuffer::from(vec![false; 3]);
    let struct_array = StructArray::new(
        Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]),
        vec![Arc::new(metadata)],
        Some(all_null),
    );

    let variant_array = VariantArray::try_new(&struct_array).unwrap();

    // Neither column may be present in the shredding state (the "AllNull" shape).
    let state = variant_array.shredding_state();
    assert!(state.value_field().is_none());
    assert!(state.typed_value_field().is_none());

    // The array keeps its three rows and none of them is valid.
    assert_eq!(variant_array.len(), 3);
    for row in [0, 1, 2] {
        assert!(!variant_array.is_valid(row));
    }

    // Same validity check driven by the array's own length, reporting the offending index.
    for i in 0..variant_array.len() {
        assert!(
            !variant_array.is_valid(i),
            "Expected value at index {i} to be null"
        );
    }
}
1402
#[test]
fn value_field_present_but_all_null_should_be_unshredded() {
    // Regression scenario: a 'value' column that exists in the schema but whose entries
    // are all null must still count as Unshredded rather than AllNull.
    let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);

    // Build a 'value' column and then mark every entry null.
    let value = {
        let placeholder = BinaryViewArray::from_iter_values(vec![""; 3]);
        let all_null = NullBuffer::from(vec![false; 3]);
        let data = placeholder
            .to_data()
            .into_builder()
            .nulls(Some(all_null))
            .build()
            .unwrap();
        BinaryViewArray::from(data)
    };

    let metadata_field = Field::new("metadata", DataType::BinaryView, false);
    // The 'value' field is part of the schema
    let value_field = Field::new("value", DataType::BinaryView, true);
    let struct_array = StructArray::new(
        Fields::from(vec![metadata_field, value_field]),
        vec![Arc::new(metadata), Arc::new(value)],
        None, // the struct rows themselves are valid; only 'value' is all null
    );

    let variant_array = VariantArray::try_new(&struct_array).unwrap();

    // 'value' present and 'typed_value' absent: Unshredded, not AllNull.
    let state = variant_array.shredding_state();
    assert!(state.value_field().is_some());
    assert!(state.typed_value_field().is_none());
}
1441
#[test]
fn canonicalize_and_verify_list_like_data_types() {
    // `parquet/tests/variant_integration.rs` checks Parquet shredded-variant fixtures that
    // rely on the Parquet LIST encoding. Those fixtures never exercise the Arrow-only list
    // containers (`LargeList`, `ListView`, `LargeListView`) that `VariantArray::try_new`
    // also accepts, so they are covered here.
    let binary_item = || Arc::new(Field::new("item", DataType::Binary, true));
    let binary_view_item = || Arc::new(Field::new("item", DataType::BinaryView, true));

    let containers: [fn(FieldRef) -> DataType; 3] = [
        DataType::LargeList,
        DataType::ListView,
        DataType::LargeListView,
    ];

    for make_container in containers {
        // A Binary element must be rewritten to BinaryView inside the same container kind.
        let input = make_container(binary_item());
        let expected = make_container(binary_view_item());
        let canonical = canonicalize_and_verify_data_type(&input).unwrap();
        assert_eq!(canonical.as_ref(), &expected);
    }
}
1472
#[test]
fn variant_array_try_new_supports_list_like_typed_value() {
    let item_field = Arc::new(Field::new("item", DataType::Int64, true));
    let values: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)]));

    // The same two rows, `[1, null]` and `[3]`, expressed with each list-like container.
    let list: ArrayRef = Arc::new(ListArray::new(
        Arc::clone(&item_field),
        OffsetBuffer::new(ScalarBuffer::from(vec![0, 2, 3])),
        Arc::clone(&values),
        None,
    ));
    let large_list: ArrayRef = Arc::new(LargeListArray::new(
        Arc::clone(&item_field),
        OffsetBuffer::new(ScalarBuffer::from(vec![0_i64, 2, 3])),
        Arc::clone(&values),
        None,
    ));
    let list_view: ArrayRef = Arc::new(ListViewArray::new(
        Arc::clone(&item_field),
        ScalarBuffer::from(vec![0, 2]),
        ScalarBuffer::from(vec![2, 1]),
        Arc::clone(&values),
        None,
    ));
    let large_list_view: ArrayRef = Arc::new(LargeListViewArray::new(
        item_field,
        ScalarBuffer::from(vec![0_i64, 2]),
        ScalarBuffer::from(vec![2_i64, 1]),
        values,
        None,
    ));

    for typed_value in [list, large_list, list_view, large_list_view] {
        // Construction must succeed and leave the typed_value data type untouched.
        let expected_type = typed_value.data_type().clone();
        let input = make_variant_struct_with_typed_value(typed_value);
        let variant_array = VariantArray::try_new(&input).unwrap();
        assert_eq!(
            variant_array.typed_value_field().unwrap().data_type(),
            &expected_type,
        );
    }
}
1516
#[test]
fn test_variant_array_iterable() {
    let mut builder = VariantArrayBuilder::new(6);

    // Mix of SQL nulls (append_null) and variant values, including Variant::Null.
    builder.append_null();
    builder.append_variant(Variant::from(1_i8));
    builder.append_variant(Variant::Null);
    builder.append_variant(Variant::from(2_i32));
    builder.append_variant(Variant::from(3_i64));
    builder.append_null();

    let array = builder.build();
    let collected: Vec<_> = array.iter().collect();

    let expected = vec![
        None,
        Some(Variant::Int8(1)),
        Some(Variant::Null),
        Some(Variant::Int32(2)),
        Some(Variant::Int64(3)),
        None,
    ];
    assert_eq!(collected, expected);
}
1544
#[test]
fn test_variant_array_iter_double_ended() {
    let mut builder = VariantArrayBuilder::new(5);

    // Rows: 0, null, 2, null, 4
    for slot in [Some(0_i32), None, Some(2_i32), None, Some(4_i32)] {
        match slot {
            Some(number) => {
                builder.append_variant(Variant::from(number));
            }
            None => {
                builder.append_null();
            }
        }
    }

    let array = builder.build();
    let mut iter = array.iter();

    // Take two rows from the front...
    assert_eq!(iter.next(), Some(Some(Variant::from(0_i32))));
    assert_eq!(iter.next(), Some(None));

    // ...then the remaining three from the back.
    assert_eq!(iter.next_back(), Some(Some(Variant::from(4_i32))));
    assert_eq!(iter.next_back(), Some(None));
    assert_eq!(iter.next_back(), Some(Some(Variant::from(2_i32))));

    // Both ends are now exhausted.
    assert_eq!(iter.next_back(), None);
    assert_eq!(iter.next(), None);
}
1568
#[test]
fn test_variant_array_iter_reverse() {
    let mut builder = VariantArrayBuilder::new(5);

    // Rows: "a", null, "aaa", null, "aaaaa"
    for slot in [Some("a"), None, Some("aaa"), None, Some("aaaaa")] {
        match slot {
            Some(text) => {
                builder.append_variant(Variant::from(text));
            }
            None => {
                builder.append_null();
            }
        }
    }

    let array = builder.build();

    // Reverse iteration yields the rows last-to-first.
    let reversed: Vec<_> = array.iter().rev().collect();
    let expected = vec![
        Some(Variant::from("aaaaa")),
        None,
        Some(Variant::from("aaa")),
        None,
        Some(Variant::from("a")),
    ];
    assert_eq!(reversed, expected);
}
1593
#[test]
fn test_variant_array_iter_empty() {
    // An empty array's iterator yields nothing from either end.
    let empty = VariantArrayBuilder::new(0).build();
    let mut iter = empty.iter();
    assert_eq!(iter.next(), None);
    assert_eq!(iter.next_back(), None);
}
1601
#[test]
fn test_from_variant_opts_into_variant_array() {
    let expected = [None, Some(Variant::Null), Some(Variant::BooleanFalse), None];

    let variant_array = VariantArray::from_iter(expected.clone());

    assert_eq!(variant_array.len(), 4);

    // `None` inputs become null rows; `Some` inputs become non-null rows holding the variant.
    for (row, want) in expected.iter().enumerate() {
        match want {
            None => assert!(variant_array.is_null(row)),
            Some(variant) => {
                assert!(!variant_array.is_null(row));
                assert_eq!(&variant_array.value(row), variant);
            }
        }
    }
}
1620
#[test]
fn test_from_variants_into_variant_array() {
    let expected = [
        Variant::Null,
        Variant::BooleanFalse,
        Variant::ShortString(ShortString::try_new("norm").unwrap()),
    ];

    let variant_array = VariantArray::from_iter(expected.clone());

    assert_eq!(variant_array.len(), 3);

    // Every input variant becomes a non-null row that reads back unchanged.
    for (row, want) in expected.iter().enumerate() {
        assert!(!variant_array.is_null(row));
        assert_eq!(&variant_array.value(row), want);
    }
}
1645
1646    #[test]
1647    fn test_variant_equality() {
1648        let v_iter = [None, Some(Variant::BooleanFalse), Some(Variant::Null), None];
1649        let v = VariantArray::from_iter(v_iter.clone());
1650
1651        {
1652            let v_copy = v.clone();
1653            assert_eq!(v, v_copy);
1654        }
1655
1656        {
1657            let v_iter_reversed = v_iter.iter().cloned().rev();
1658            let v_reversed = VariantArray::from_iter(v_iter_reversed);
1659
1660            assert_ne!(v, v_reversed);
1661        }
1662
1663        {
1664            let v_sliced = v.slice(0, 1);
1665            assert_ne!(v, v_sliced);
1666        }
1667    }
1668
1669    macro_rules! invalid_variant_array_test {
1670        ($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => {
1671            #[test]
1672            fn $fn_name() {
1673                let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
1674                    EMPTY_VARIANT_METADATA_BYTES,
1675                    1,
1676                ));
1677                let invalid_typed_value = $invalid_typed_value;
1678
1679                let struct_array = StructArrayBuilder::new()
1680                    .with_field("metadata", Arc::new(metadata), false)
1681                    .with_field("typed_value", Arc::new(invalid_typed_value), true)
1682                    .build();
1683
1684                let array: VariantArray = VariantArray::try_new(&struct_array)
1685                    .expect("should create variant array")
1686                    .into();
1687
1688                let result = array.try_value(0);
1689                assert!(result.is_err());
1690                let error = result.unwrap_err();
1691                assert!(matches!(error, ArrowError::CastError(_)));
1692
1693                let expected: &str = $error_msg;
1694                assert!(
1695                    error.to_string().contains($error_msg),
1696                    "error `{}` did not contain `{}`",
1697                    error,
1698                    expected
1699                )
1700            }
1701        };
1702    }
1703
1704    invalid_variant_array_test!(
1705        test_variant_array_invalide_time,
1706        Time64MicrosecondArray::from(vec![Some(86401000000)]),
1707        "Cast error: Cast failed at index 0 (array type: Time64(µs)): Invalid microsecond from midnight: 86401000000"
1708    );
1709
1710    invalid_variant_array_test!(
1711        test_variant_array_invalid_decimal32,
1712        Decimal32Array::from(vec![Some(1234567890)]),
1713        "Cast error: Cast failed at index 0 (array type: Decimal32(9, 2)): Invalid argument error: 1234567890 is wider than max precision 9"
1714    );
1715
1716    invalid_variant_array_test!(
1717        test_variant_array_invalid_decimal64,
1718        Decimal64Array::from(vec![Some(1234567890123456789)]),
1719        "Cast error: Cast failed at index 0 (array type: Decimal64(18, 6)): Invalid argument error: 1234567890123456789 is wider than max precision 18"
1720    );
1721
1722    invalid_variant_array_test!(
1723        test_variant_array_invalid_decimal128,
1724        Decimal128Array::from(vec![Some(
1725            i128::from_str("123456789012345678901234567890123456789").unwrap()
1726        ),]),
1727        "Cast error: Cast failed at index 0 (array type: Decimal128(38, 10)): Invalid argument error: 123456789012345678901234567890123456789 is wider than max precision 38"
1728    );
1729}