parquet_variant_compute/
variant_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArray`] implementation
19
20use crate::VariantArrayBuilder;
21use crate::type_conversion::{
22    generic_conversion_single_value, generic_conversion_single_value_with_result,
23    primitive_conversion_single_value,
24};
25use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
26use arrow::buffer::NullBuffer;
27use arrow::compute::cast;
28use arrow::datatypes::{
29    Date32Type, Decimal32Type, Decimal64Type, Decimal128Type, Float16Type, Float32Type,
30    Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, Time64MicrosecondType,
31    TimestampMicrosecondType, TimestampNanosecondType,
32};
33use arrow::error::Result;
34use arrow_schema::extension::ExtensionType;
35use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
36use chrono::{DateTime, NaiveTime};
37use parquet_variant::{
38    Uuid, Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16, VariantDecimalType as _,
39};
40
41use std::borrow::Cow;
42use std::sync::Arc;
43
44/// Arrow Variant [`ExtensionType`].
45///
46/// Represents the canonical Arrow Extension Type for storing variants.
47/// See [`VariantArray`] for more examples of using this extension type.
48pub struct VariantType;
49
50impl ExtensionType for VariantType {
51    const NAME: &'static str = "arrow.parquet.variant";
52
53    // Variants extension metadata is an empty string
54    // <https://github.com/apache/arrow/blob/d803afcc43f5d132506318fd9e162d33b2c3d4cd/docs/source/format/CanonicalExtensions.rst?plain=1#L473>
55    type Metadata = &'static str;
56
57    fn metadata(&self) -> &Self::Metadata {
58        &""
59    }
60
61    fn serialize_metadata(&self) -> Option<String> {
62        Some(String::new())
63    }
64
65    fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata> {
66        Ok("")
67    }
68
69    fn supports_data_type(&self, data_type: &DataType) -> Result<()> {
70        if matches!(data_type, DataType::Struct(_)) {
71            Ok(())
72        } else {
73            Err(ArrowError::InvalidArgumentError(format!(
74                "VariantType only supports StructArray, got {data_type}"
75            )))
76        }
77    }
78
79    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self> {
80        Self.supports_data_type(data_type)?;
81        Ok(Self)
82    }
83}
84
85/// An array of Parquet [`Variant`] values
86///
87/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
88/// `metadata` and `value` fields, and adds convenience methods to access
89/// the [`Variant`]s.
90///
91/// See [`VariantArrayBuilder`] for constructing `VariantArray` row by row.
92///
/// See the examples below for converting between `VariantArray` and
/// `StructArray`.
95///
96/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder
97///
98/// # Documentation
99///
/// At the time of this writing, Variant has been accepted as an official
/// extension type but has not yet been published to the [official list of
/// extension types] on the Apache Arrow website. See the [Extension Type for
/// Parquet Variant arrow] ticket for more details.
104///
105/// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908
106/// [official list of extension types]: https://arrow.apache.org/docs/format/CanonicalExtensions.html
107///
108/// # Example: Check if a [`StructArray`] has the [`VariantType`] extension
109///
110/// Arrow Arrays only provide [`DataType`], but the extension type information
111/// is stored on a [`Field`]. Thus, you must have access to the [`Schema`] or
112/// [`Field`] to check for the extension type.
113///
114/// [`Schema`]: arrow_schema::Schema
115/// ```
116/// # use arrow::array::StructArray;
117/// # use arrow_schema::{Schema, Field, DataType};
118/// # use parquet_variant::Variant;
119/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
120/// # fn get_variant_array() -> VariantArray {
121/// #   let mut builder = VariantArrayBuilder::new(10);
122/// #   builder.append_variant(Variant::from("such wow"));
123/// #   builder.build()
124/// # }
125/// # fn get_schema() -> Schema {
126/// #   Schema::new(vec![
127/// #     Field::new("id", DataType::Int32, false),
128/// #     get_variant_array().field("var"),
129/// #   ])
130/// # }
131/// let schema = get_schema();
132/// assert_eq!(schema.fields().len(), 2);
133/// // first field is not a Variant
134/// assert!(schema.field(0).try_extension_type::<VariantType>().is_err());
135/// // second field is a Variant
136/// assert!(schema.field(1).try_extension_type::<VariantType>().is_ok());
137/// ```
138///
139/// # Example: Constructing the correct [`Field`] for a [`VariantArray`]
140///
141/// You can construct the correct [`Field`] for a [`VariantArray`] using the
142/// [`VariantArray::field`] method.
143///
144/// ```
145/// # use arrow_schema::{Schema, Field, DataType};
146/// # use parquet_variant::Variant;
147/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
148/// # fn get_variant_array() -> VariantArray {
149/// #   let mut builder = VariantArrayBuilder::new(10);
150/// #   builder.append_variant(Variant::from("such wow"));
151/// #   builder.build()
152/// # }
153/// let variant_array = get_variant_array();
154/// // First field is an integer id, second field is a variant
155/// let schema = Schema::new(vec![
156///   Field::new("id", DataType::Int32, false),
157///   // call VariantArray::field to get the correct Field
158///   variant_array.field("var"),
159/// ]);
160/// ```
161///
162/// You can also construct the [`Field`] using [`VariantType`] directly
163///
164/// ```
165/// # use arrow_schema::{Schema, Field, DataType};
166/// # use parquet_variant::Variant;
167/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
168/// # fn get_variant_array() -> VariantArray {
169/// #   let mut builder = VariantArrayBuilder::new(10);
170/// #   builder.append_variant(Variant::from("such wow"));
171/// #   builder.build()
172/// # }
173/// # let variant_array = get_variant_array();
174/// // The DataType of a VariantArray varies depending on how it is shredded
175/// let data_type = variant_array.data_type().clone();
176/// // First field is an integer id, second field is a variant
177/// let schema = Schema::new(vec![
178///   Field::new("id", DataType::Int32, false),
179///   Field::new("var", data_type, false)
180///     // Add extension metadata to the field using `VariantType`
181///     .with_extension_type(VariantType),
182/// ]);
183/// ```
184///
185/// # Example: Converting a [`VariantArray`] to a [`StructArray`]
186///
187/// ```
188/// # use arrow::array::StructArray;
189/// # use parquet_variant::Variant;
190/// # use parquet_variant_compute::VariantArrayBuilder;
191/// // Create Variant Array
192/// let mut builder = VariantArrayBuilder::new(10);
193/// builder.append_variant(Variant::from("such wow"));
194/// let variant_array = builder.build();
195/// // convert to StructArray
196/// let struct_array: StructArray = variant_array.into();
197/// ```
198///
199/// # Example: Converting a [`StructArray`] to a [`VariantArray`]
200///
201/// ```
202/// # use arrow::array::StructArray;
203/// # use parquet_variant::Variant;
204/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray};
205/// # fn get_struct_array() -> StructArray {
206/// #   let mut builder = VariantArrayBuilder::new(10);
207/// #   builder.append_variant(Variant::from("such wow"));
208/// #   builder.build().into()
209/// # }
210/// let struct_array: StructArray = get_struct_array();
211/// // try and create a VariantArray from it
212/// let variant_array = VariantArray::try_new(&struct_array).unwrap();
213/// assert_eq!(variant_array.value(0), Variant::from("such wow"));
214/// ```
215///
216#[derive(Debug, Clone, PartialEq)]
217pub struct VariantArray {
218    /// Reference to the underlying StructArray
219    inner: StructArray,
220
221    /// The metadata column of this variant
222    metadata: BinaryViewArray,
223
224    /// how is this variant array shredded?
225    shredding_state: ShreddingState,
226}
227
228impl VariantArray {
229    /// Creates a new `VariantArray` from a [`StructArray`].
230    ///
231    /// # Arguments
232    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
233    ///
234    /// # Returns
235    /// - A new instance of `VariantArray`.
236    ///
237    /// # Errors:
238    /// - If the `StructArray` does not contain the required fields
239    ///
240    /// # Requirements of the `StructArray`
241    ///
242    /// 1. A required field named `metadata` which is binary, large_binary, or
243    ///    binary_view
244    ///
245    /// 2. An optional field named `value` that is binary, large_binary, or
246    ///    binary_view
247    ///
248    /// 3. An optional field named `typed_value` which can be any primitive type
249    ///    or be a list, large_list, list_view or struct
250    ///
251    /// NOTE: It is also permissible for the metadata field to be
252    /// Dictionary-Encoded, preferably (but not required) with an index type of
253    /// int8.
254    ///
255    /// Currently, only [`BinaryViewArray`] are supported.
256    pub fn try_new(inner: &dyn Array) -> Result<Self> {
257        // Workaround lack of support for Binary
258        // https://github.com/apache/arrow-rs/issues/8387
259        let inner = cast_to_binary_view_arrays(inner)?;
260
261        let Some(inner) = inner.as_struct_opt() else {
262            return Err(ArrowError::InvalidArgumentError(
263                "Invalid VariantArray: requires StructArray as input".to_string(),
264            ));
265        };
266
267        // Note the specification allows for any order so we must search by name
268
269        // Ensure the StructArray has a metadata field of BinaryView
270        let Some(metadata_field) = inner.column_by_name("metadata") else {
271            return Err(ArrowError::InvalidArgumentError(
272                "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
273            ));
274        };
275        let Some(metadata) = metadata_field.as_binary_view_opt() else {
276            return Err(ArrowError::NotYetImplemented(format!(
277                "VariantArray 'metadata' field must be BinaryView, got {}",
278                metadata_field.data_type()
279            )));
280        };
281
282        // Note these clones are cheap, they just bump the ref count
283        Ok(Self {
284            inner: inner.clone(),
285            metadata: metadata.clone(),
286            shredding_state: ShreddingState::try_from(inner)?,
287        })
288    }
289
290    pub(crate) fn from_parts(
291        metadata: BinaryViewArray,
292        value: Option<BinaryViewArray>,
293        typed_value: Option<ArrayRef>,
294        nulls: Option<NullBuffer>,
295    ) -> Self {
296        let mut builder =
297            StructArrayBuilder::new().with_field("metadata", Arc::new(metadata.clone()), false);
298        if let Some(value) = value.clone() {
299            builder = builder.with_field("value", Arc::new(value), true);
300        }
301        if let Some(typed_value) = typed_value.clone() {
302            builder = builder.with_field("typed_value", typed_value, true);
303        }
304        if let Some(nulls) = nulls {
305            builder = builder.with_nulls(nulls);
306        }
307
308        Self {
309            inner: builder.build(),
310            metadata,
311            shredding_state: ShreddingState::new(value, typed_value),
312        }
313    }
314
315    /// Returns a reference to the underlying [`StructArray`].
316    pub fn inner(&self) -> &StructArray {
317        &self.inner
318    }
319
320    /// Returns the inner [`StructArray`], consuming self
321    pub fn into_inner(self) -> StructArray {
322        self.inner
323    }
324
325    /// Return the shredding state of this `VariantArray`
326    pub fn shredding_state(&self) -> &ShreddingState {
327        &self.shredding_state
328    }
329
330    /// Return the [`Variant`] instance stored at the given row
331    ///
332    /// This is a convenience wrapper that calls [`VariantArray::try_value`] and unwraps the `Result`.
333    /// Use `try_value` if you need to handle conversion errors gracefully.
334    ///
335    /// # Panics
336    /// * if the index is out of bounds
337    /// * if the array value is null
338    /// * if `try_value` returns an error.
339    pub fn value(&self, index: usize) -> Variant<'_, '_> {
340        self.try_value(index).unwrap()
341    }
342
343    /// Return the [`Variant`] instance stored at the given row
344    ///
345    /// Note: This method does not check for nulls and the value is arbitrary
346    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
347    ///
348    /// # Panics
349    ///
350    /// Panics if
351    /// * the index is out of bounds
352    /// * the array value is null
353    ///
354    /// # Errors
355    ///
356    /// Errors if
357    /// - the data in `typed_value` cannot be interpreted as a valid `Variant`
358    ///
359    /// If this is a shredded variant but has no value at the shredded location, it
360    /// will return [`Variant::Null`].
361    ///
362    ///
363    /// # Performance Note
364    ///
365    /// This is certainly not the most efficient way to access values in a
366    /// `VariantArray`, but it is useful for testing and debugging.
367    ///
368    /// Note: Does not do deep validation of the [`Variant`], so it is up to the
369    /// caller to ensure that the metadata and value were constructed correctly.
370    pub fn try_value(&self, index: usize) -> Result<Variant<'_, '_>> {
371        match (self.typed_value_field(), self.value_field()) {
372            // Always prefer typed_value, if available
373            (Some(typed_value), value) if typed_value.is_valid(index) => {
374                typed_value_to_variant(typed_value, value, index)
375            }
376            // Otherwise fall back to value, if available
377            (_, Some(value)) if value.is_valid(index) => {
378                Ok(Variant::new(self.metadata.value(index), value.value(index)))
379            }
380            // It is technically invalid for neither value nor typed_value fields to be available,
381            // but the spec specifically requires readers to return Variant::Null in this case.
382            _ => Ok(Variant::Null),
383        }
384    }
385
386    /// Return a reference to the metadata field of the [`StructArray`]
387    pub fn metadata_field(&self) -> &BinaryViewArray {
388        &self.metadata
389    }
390
391    /// Return a reference to the value field of the `StructArray`
392    pub fn value_field(&self) -> Option<&BinaryViewArray> {
393        self.shredding_state.value_field()
394    }
395
396    /// Return a reference to the typed_value field of the `StructArray`, if present
397    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
398        self.shredding_state.typed_value_field()
399    }
400
401    /// Return a field to represent this VariantArray in a `Schema` with
402    /// a particular name
403    pub fn field(&self, name: impl Into<String>) -> Field {
404        Field::new(
405            name.into(),
406            self.data_type().clone(),
407            self.inner.is_nullable(),
408        )
409        .with_extension_type(VariantType)
410    }
411
412    /// Returns a new DataType representing this VariantArray's inner type
413    pub fn data_type(&self) -> &DataType {
414        self.inner.data_type()
415    }
416
417    pub fn slice(&self, offset: usize, length: usize) -> Self {
418        let inner = self.inner.slice(offset, length);
419        let metadata = self.metadata.slice(offset, length);
420        let shredding_state = self.shredding_state.slice(offset, length);
421        Self {
422            inner,
423            metadata,
424            shredding_state,
425        }
426    }
427
428    pub fn len(&self) -> usize {
429        self.inner.len()
430    }
431
432    pub fn is_empty(&self) -> bool {
433        self.inner.is_empty()
434    }
435
436    pub fn nulls(&self) -> Option<&NullBuffer> {
437        self.inner.nulls()
438    }
439
440    /// Is the element at index null?
441    pub fn is_null(&self, index: usize) -> bool {
442        self.nulls().is_some_and(|n| n.is_null(index))
443    }
444
445    /// Is the element at index valid (not null)?
446    pub fn is_valid(&self, index: usize) -> bool {
447        !self.is_null(index)
448    }
449
450    /// Returns an iterator over the values in this array
451    pub fn iter(&self) -> VariantArrayIter<'_> {
452        VariantArrayIter::new(self)
453    }
454}
455
456impl From<VariantArray> for StructArray {
457    fn from(variant_array: VariantArray) -> Self {
458        variant_array.into_inner()
459    }
460}
461
462impl From<VariantArray> for ArrayRef {
463    fn from(variant_array: VariantArray) -> Self {
464        Arc::new(variant_array.into_inner())
465    }
466}
467
468impl<'m, 'v> FromIterator<Option<Variant<'m, 'v>>> for VariantArray {
469    fn from_iter<T: IntoIterator<Item = Option<Variant<'m, 'v>>>>(iter: T) -> Self {
470        let iter = iter.into_iter();
471
472        let mut b = VariantArrayBuilder::new(iter.size_hint().0);
473        b.extend(iter);
474        b.build()
475    }
476}
477
478impl<'m, 'v> FromIterator<Variant<'m, 'v>> for VariantArray {
479    fn from_iter<T: IntoIterator<Item = Variant<'m, 'v>>>(iter: T) -> Self {
480        Self::from_iter(iter.into_iter().map(Some))
481    }
482}
483
484/// An iterator over [`VariantArray`]
485///
486/// This iterator returns `Option<Option<Variant<'a, 'a>>>` where:
487/// - `None` indicates the end of iteration
488/// - `Some(None)` indicates a null value at this position
489/// - `Some(Some(variant))` indicates a valid variant value
490///
491/// # Example
492///
493/// ```
494/// # use parquet_variant::Variant;
495/// # use parquet_variant_compute::VariantArrayBuilder;
496/// let mut builder = VariantArrayBuilder::new(10);
497/// builder.append_variant(Variant::from(42));
498/// builder.append_null();
499/// builder.append_variant(Variant::from("hello"));
500/// let array = builder.build();
501///
502/// let values = array.iter().collect::<Vec<_>>();
503/// assert_eq!(values.len(), 3);
504/// assert_eq!(values[0], Some(Variant::from(42)));
505/// assert_eq!(values[1], None);
506/// assert_eq!(values[2], Some(Variant::from("hello")));
507/// ```
508#[derive(Debug)]
509pub struct VariantArrayIter<'a> {
510    array: &'a VariantArray,
511    head_i: usize,
512    tail_i: usize,
513}
514
515impl<'a> VariantArrayIter<'a> {
516    /// Creates a new iterator over the given [`VariantArray`]
517    pub fn new(array: &'a VariantArray) -> Self {
518        Self {
519            array,
520            head_i: 0,
521            tail_i: array.len(),
522        }
523    }
524
525    fn value_opt(&self, i: usize) -> Option<Variant<'a, 'a>> {
526        self.array.is_valid(i).then(|| self.array.value(i))
527    }
528}
529
530impl<'a> Iterator for VariantArrayIter<'a> {
531    type Item = Option<Variant<'a, 'a>>;
532
533    #[inline]
534    fn next(&mut self) -> Option<Self::Item> {
535        if self.head_i == self.tail_i {
536            return None;
537        }
538
539        let out = self.value_opt(self.head_i);
540
541        self.head_i += 1;
542
543        Some(out)
544    }
545
546    fn size_hint(&self) -> (usize, Option<usize>) {
547        let remainder = self.tail_i - self.head_i;
548
549        (remainder, Some(remainder))
550    }
551}
552
553impl<'a> DoubleEndedIterator for VariantArrayIter<'a> {
554    fn next_back(&mut self) -> Option<Self::Item> {
555        if self.head_i == self.tail_i {
556            return None;
557        }
558
559        self.tail_i -= 1;
560
561        Some(self.value_opt(self.tail_i))
562    }
563}
564
565impl<'a> ExactSizeIterator for VariantArrayIter<'a> {}
566
567/// One shredded field of a partially or perfectly shredded variant. For example, suppose the
568/// shredding schema for variant `v` treats it as an object with a single field `a`, where `a` is
569/// itself a struct with the single field `b` of type INT. Then the physical layout of the column
570/// is:
571///
572/// ```text
573/// v: VARIANT {
574///     metadata: BINARY,
575///     value: BINARY,
576///     typed_value: STRUCT {
577///         a: SHREDDED_VARIANT_FIELD {
578///             value: BINARY,
579///             typed_value: STRUCT {
///                 b: SHREDDED_VARIANT_FIELD {
581///                     value: BINARY,
582///                     typed_value: INT,
583///                 },
584///             },
585///         },
586///     },
587/// }
588/// ```
589///
590/// In the above, each row of `v.value` is either a variant value (shredding failed, `v` was not an
591/// object at all) or a variant object (partial shredding, `v` was an object but included unexpected
592/// fields other than `a`), or is NULL (perfect shredding, `v` was an object containing only the
593/// single expected field `a`).
594///
595/// A similar story unfolds for each `v.typed_value.a.value` -- a variant value if shredding failed
596/// (`v:a` was not an object at all), or a variant object (`v:a` was an object with unexpected
597/// additional fields), or NULL (`v:a` was an object containing only the single expected field `b`).
598///
599/// Finally, `v.typed_value.a.typed_value.b.value` is either NULL (`v:a.b` was an integer) or else a
600/// variant value (which could be `Variant::Null`).
601#[derive(Debug)]
602pub struct ShreddedVariantFieldArray {
603    /// Reference to the underlying StructArray
604    inner: StructArray,
605    shredding_state: ShreddingState,
606}
607
608#[allow(unused)]
609impl ShreddedVariantFieldArray {
610    /// Creates a new `ShreddedVariantFieldArray` from a [`StructArray`].
611    ///
612    /// # Arguments
613    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
614    ///
615    /// # Returns
616    /// - A new instance of `ShreddedVariantFieldArray`.
617    ///
618    /// # Errors:
619    /// - If the `StructArray` does not contain the required fields
620    ///
621    /// # Requirements of the `StructArray`
622    ///
623    /// 1. An optional field named `value` that is binary, large_binary, or
624    ///    binary_view
625    ///
626    /// 2. An optional field named `typed_value` which can be any primitive type
627    ///    or be a list, large_list, list_view or struct
628    ///
629    /// Currently, only `value` columns of type [`BinaryViewArray`] are supported.
630    pub fn try_new(inner: &dyn Array) -> Result<Self> {
631        let Some(inner_struct) = inner.as_struct_opt() else {
632            return Err(ArrowError::InvalidArgumentError(
633                "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(),
634            ));
635        };
636
637        // Note this clone is cheap, it just bumps the ref count
638        Ok(Self {
639            inner: inner_struct.clone(),
640            shredding_state: ShreddingState::try_from(inner_struct)?,
641        })
642    }
643
644    /// Return the shredding state of this `VariantArray`
645    pub fn shredding_state(&self) -> &ShreddingState {
646        &self.shredding_state
647    }
648
649    /// Return a reference to the value field of the `StructArray`
650    pub fn value_field(&self) -> Option<&BinaryViewArray> {
651        self.shredding_state.value_field()
652    }
653
654    /// Return a reference to the typed_value field of the `StructArray`, if present
655    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
656        self.shredding_state.typed_value_field()
657    }
658
659    /// Returns a reference to the underlying [`StructArray`].
660    pub fn inner(&self) -> &StructArray {
661        &self.inner
662    }
663
664    pub(crate) fn from_parts(
665        value: Option<BinaryViewArray>,
666        typed_value: Option<ArrayRef>,
667        nulls: Option<NullBuffer>,
668    ) -> Self {
669        let mut builder = StructArrayBuilder::new();
670        if let Some(value) = value.clone() {
671            builder = builder.with_field("value", Arc::new(value), true);
672        }
673        if let Some(typed_value) = typed_value.clone() {
674            builder = builder.with_field("typed_value", typed_value, true);
675        }
676        if let Some(nulls) = nulls {
677            builder = builder.with_nulls(nulls);
678        }
679
680        Self {
681            inner: builder.build(),
682            shredding_state: ShreddingState::new(value, typed_value),
683        }
684    }
685
686    /// Returns the inner [`StructArray`], consuming self
687    pub fn into_inner(self) -> StructArray {
688        self.inner
689    }
690
691    pub fn data_type(&self) -> &DataType {
692        self.inner.data_type()
693    }
694
695    pub fn len(&self) -> usize {
696        self.inner.len()
697    }
698
699    pub fn is_empty(&self) -> bool {
700        self.inner.is_empty()
701    }
702
703    pub fn offset(&self) -> usize {
704        self.inner.offset()
705    }
706
707    pub fn nulls(&self) -> Option<&NullBuffer> {
708        // According to the shredding spec, ShreddedVariantFieldArray should be
709        // physically non-nullable - SQL NULL is inferred by both value and
710        // typed_value being physically NULL
711        None
712    }
713    /// Is the element at index null?
714    pub fn is_null(&self, index: usize) -> bool {
715        self.nulls().is_some_and(|n| n.is_null(index))
716    }
717
718    /// Is the element at index valid (not null)?
719    pub fn is_valid(&self, index: usize) -> bool {
720        !self.is_null(index)
721    }
722}
723
724impl From<ShreddedVariantFieldArray> for ArrayRef {
725    fn from(array: ShreddedVariantFieldArray) -> Self {
726        Arc::new(array.into_inner())
727    }
728}
729
730impl From<ShreddedVariantFieldArray> for StructArray {
731    fn from(array: ShreddedVariantFieldArray) -> Self {
732        array.into_inner()
733    }
734}
735
736/// Represents the shredding state of a [`VariantArray`]
737///
738/// [`VariantArray`]s can be shredded according to the [Parquet Variant
739/// Shredding Spec]. Shredding means that the actual value is stored in a typed
/// `typed_value` field instead of the generic `value` field.
741///
742/// Both value and typed_value are optional fields used together to encode a
743/// single value. Values in the two fields must be interpreted according to the
744/// following table (see [Parquet Variant Shredding Spec] for more details):
745///
746/// | value    | typed_value  | Meaning |
747/// |----------|--------------|---------|
748/// | NULL     | NULL         | The value is missing; only valid for shredded object fields |
749/// | non-NULL | NULL         | The value is present and may be any type, including [`Variant::Null`] |
750/// | NULL     | non-NULL     | The value is present and is the shredded type |
751/// | non-NULL | non-NULL     | The value is present and is a partially shredded object |
752///
753///
754/// Applying the above rules to entire columns, we obtain the following:
755///
756/// | value  | typed_value  | Meaning |
757/// |--------|-------------|---------|
758/// | --     | --          | **Missing**: The value is always missing; only valid for shredded object fields |
/// | exists | --          | **Unshredded**: If present, the value may be any type, including [`Variant::Null`] |
760/// | --     | exists      | **Perfectly shredded**: If present, the value is always the shredded type |
761/// | exists | exists      | **Imperfectly shredded**: The value might (not) be present and might (not) be the shredded type |
762///
763/// NOTE: Partial shredding is a row-wise situation that can arise under imperfect shredding (a
764/// column-wise situation): When both columns exist (imperfect shredding) and the typed_value column
765/// is a struct, then both columns can be non-NULL for the same row if value is a variant object
766/// (partial shredding).
767///
768/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
769#[derive(Debug, Clone, PartialEq)]
770pub struct ShreddingState {
771    value: Option<BinaryViewArray>,
772    typed_value: Option<ArrayRef>,
773}
774
775impl ShreddingState {
776    /// Create a new `ShreddingState` from the given `value` and `typed_value` fields
777    ///
778    /// Note you can create a `ShreddingState` from a &[`StructArray`] using
779    /// `ShreddingState::try_from(&struct_array)`, for example:
780    ///
781    /// ```no_run
782    /// # use arrow::array::StructArray;
783    /// # use parquet_variant_compute::ShreddingState;
784    /// # fn get_struct_array() -> StructArray {
785    /// #   unimplemented!()
786    /// # }
787    /// let struct_array: StructArray = get_struct_array();
788    /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
789    /// ```
790    pub fn new(value: Option<BinaryViewArray>, typed_value: Option<ArrayRef>) -> Self {
791        Self { value, typed_value }
792    }
793
794    /// Return a reference to the value field, if present
795    pub fn value_field(&self) -> Option<&BinaryViewArray> {
796        self.value.as_ref()
797    }
798
799    /// Return a reference to the typed_value field, if present
800    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
801        self.typed_value.as_ref()
802    }
803
804    /// Returns a borrowed version of this shredding state
805    pub fn borrow(&self) -> BorrowedShreddingState<'_> {
806        BorrowedShreddingState {
807            value: self.value_field(),
808            typed_value: self.typed_value_field(),
809        }
810    }
811
812    /// Slice all the underlying arrays
813    pub fn slice(&self, offset: usize, length: usize) -> Self {
814        Self {
815            value: self.value.as_ref().map(|v| v.slice(offset, length)),
816            typed_value: self.typed_value.as_ref().map(|tv| tv.slice(offset, length)),
817        }
818    }
819}
820
821/// Similar to [`ShreddingState`] except it holds borrowed references of the target arrays. Useful
822/// for avoiding clone operations when the caller does not need a self-standing shredding state.
823#[derive(Clone, Debug)]
824pub struct BorrowedShreddingState<'a> {
825    value: Option<&'a BinaryViewArray>,
826    typed_value: Option<&'a ArrayRef>,
827}
828
829impl<'a> BorrowedShreddingState<'a> {
830    /// Create a new `BorrowedShreddingState` from the given `value` and `typed_value` fields
831    ///
832    /// Note you can create a `BorrowedShreddingState` from a &[`StructArray`] using
833    /// `BorrowedShreddingState::try_from(&struct_array)`, for example:
834    ///
835    /// ```no_run
836    /// # use arrow::array::StructArray;
837    /// # use parquet_variant_compute::BorrowedShreddingState;
838    /// # fn get_struct_array() -> StructArray {
839    /// #   unimplemented!()
840    /// # }
841    /// let struct_array: StructArray = get_struct_array();
842    /// let shredding_state = BorrowedShreddingState::try_from(&struct_array).unwrap();
843    /// ```
844    pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a ArrayRef>) -> Self {
845        Self { value, typed_value }
846    }
847
848    /// Return a reference to the value field, if present
849    pub fn value_field(&self) -> Option<&'a BinaryViewArray> {
850        self.value
851    }
852
853    /// Return a reference to the typed_value field, if present
854    pub fn typed_value_field(&self) -> Option<&'a ArrayRef> {
855        self.typed_value
856    }
857}
858
859impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
860    type Error = ArrowError;
861
862    fn try_from(inner_struct: &'a StructArray) -> Result<Self> {
863        // The `value` column need not exist, but if it does it must be a binary view.
864        let value = if let Some(value_col) = inner_struct.column_by_name("value") {
865            let Some(binary_view) = value_col.as_binary_view_opt() else {
866                return Err(ArrowError::NotYetImplemented(format!(
867                    "VariantArray 'value' field must be BinaryView, got {}",
868                    value_col.data_type()
869                )));
870            };
871            Some(binary_view)
872        } else {
873            None
874        };
875        let typed_value = inner_struct.column_by_name("typed_value");
876        Ok(BorrowedShreddingState::new(value, typed_value))
877    }
878}
879
880impl TryFrom<&StructArray> for ShreddingState {
881    type Error = ArrowError;
882
883    fn try_from(inner_struct: &StructArray) -> Result<Self> {
884        Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
885    }
886}
887
888impl From<BorrowedShreddingState<'_>> for ShreddingState {
889    fn from(state: BorrowedShreddingState<'_>) -> Self {
890        ShreddingState {
891            value: state.value_field().cloned(),
892            typed_value: state.typed_value_field().cloned(),
893        }
894    }
895}
896
897/// Builds struct arrays from component fields
898///
899/// TODO: move to arrow crate
900#[derive(Debug, Default, Clone)]
901pub(crate) struct StructArrayBuilder {
902    fields: Vec<FieldRef>,
903    arrays: Vec<ArrayRef>,
904    nulls: Option<NullBuffer>,
905}
906
907impl StructArrayBuilder {
908    pub fn new() -> Self {
909        Default::default()
910    }
911
912    /// Add an array to this struct array as a field with the specified name.
913    pub fn with_field(mut self, field_name: &str, array: ArrayRef, nullable: bool) -> Self {
914        let field = Field::new(field_name, array.data_type().clone(), nullable);
915        self.fields.push(Arc::new(field));
916        self.arrays.push(array);
917        self
918    }
919
920    /// Set the null buffer for this struct array.
921    pub fn with_nulls(mut self, nulls: NullBuffer) -> Self {
922        self.nulls = Some(nulls);
923        self
924    }
925
926    pub fn build(self) -> StructArray {
927        let Self {
928            fields,
929            arrays,
930            nulls,
931        } = self;
932        StructArray::new(Fields::from(fields), arrays, nulls)
933    }
934}
935
936/// returns the non-null element at index as a Variant
937fn typed_value_to_variant<'a>(
938    typed_value: &'a ArrayRef,
939    value: Option<&BinaryViewArray>,
940    index: usize,
941) -> Result<Variant<'a, 'a>> {
942    let data_type = typed_value.data_type();
943    if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) {
944        // Only a partially shredded struct is allowed to have values for both columns
945        panic!("Invalid variant, conflicting value and typed_value");
946    }
947    match data_type {
948        DataType::Null => Ok(Variant::Null),
949        DataType::Boolean => {
950            let boolean_array = typed_value.as_boolean();
951            let value = boolean_array.value(index);
952            Ok(Variant::from(value))
953        }
954        // 16-byte FixedSizeBinary alway corresponds to a UUID; all other sizes are illegal.
955        DataType::FixedSizeBinary(16) => {
956            let array = typed_value.as_fixed_size_binary();
957            let value = array.value(index);
958            Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe: slice is always 16 bytes
959        }
960        DataType::BinaryView => {
961            let array = typed_value.as_binary_view();
962            let value = array.value(index);
963            Ok(Variant::from(value))
964        }
965        DataType::Utf8 => {
966            let array = typed_value.as_string::<i32>();
967            let value = array.value(index);
968            Ok(Variant::from(value))
969        }
970        DataType::LargeUtf8 => {
971            let array = typed_value.as_string::<i64>();
972            let value = array.value(index);
973            Ok(Variant::from(value))
974        }
975        DataType::Utf8View => {
976            let array = typed_value.as_string_view();
977            let value = array.value(index);
978            Ok(Variant::from(value))
979        }
980        DataType::Int8 => {
981            primitive_conversion_single_value!(Int8Type, typed_value, index)
982        }
983        DataType::Int16 => {
984            primitive_conversion_single_value!(Int16Type, typed_value, index)
985        }
986        DataType::Int32 => {
987            primitive_conversion_single_value!(Int32Type, typed_value, index)
988        }
989        DataType::Int64 => {
990            primitive_conversion_single_value!(Int64Type, typed_value, index)
991        }
992        DataType::Float16 => {
993            primitive_conversion_single_value!(Float16Type, typed_value, index)
994        }
995        DataType::Float32 => {
996            primitive_conversion_single_value!(Float32Type, typed_value, index)
997        }
998        DataType::Float64 => {
999            primitive_conversion_single_value!(Float64Type, typed_value, index)
1000        }
1001        DataType::Decimal32(_, s) => {
1002            generic_conversion_single_value_with_result!(
1003                Decimal32Type,
1004                as_primitive,
1005                |v| VariantDecimal4::try_new(v, *s as u8),
1006                typed_value,
1007                index
1008            )
1009        }
1010        DataType::Decimal64(_, s) => {
1011            generic_conversion_single_value_with_result!(
1012                Decimal64Type,
1013                as_primitive,
1014                |v| VariantDecimal8::try_new(v, *s as u8),
1015                typed_value,
1016                index
1017            )
1018        }
1019        DataType::Decimal128(_, s) => {
1020            generic_conversion_single_value_with_result!(
1021                Decimal128Type,
1022                as_primitive,
1023                |v| VariantDecimal16::try_new(v, *s as u8),
1024                typed_value,
1025                index
1026            )
1027        }
1028        DataType::Date32 => {
1029            generic_conversion_single_value!(
1030                Date32Type,
1031                as_primitive,
1032                Date32Type::to_naive_date,
1033                typed_value,
1034                index
1035            )
1036        }
1037        DataType::Time64(TimeUnit::Microsecond) => {
1038            generic_conversion_single_value_with_result!(
1039                Time64MicrosecondType,
1040                as_primitive,
1041                |v| NaiveTime::from_num_seconds_from_midnight_opt(
1042                    (v / 1_000_000) as u32,
1043                    (v % 1_000_000) as u32 * 1000
1044                )
1045                .ok_or_else(|| format!("Invalid microsecond from midnight: {}", v)),
1046                typed_value,
1047                index
1048            )
1049        }
1050        DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => {
1051            generic_conversion_single_value!(
1052                TimestampMicrosecondType,
1053                as_primitive,
1054                |v| DateTime::from_timestamp_micros(v).unwrap(),
1055                typed_value,
1056                index
1057            )
1058        }
1059        DataType::Timestamp(TimeUnit::Microsecond, None) => {
1060            generic_conversion_single_value!(
1061                TimestampMicrosecondType,
1062                as_primitive,
1063                |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(),
1064                typed_value,
1065                index
1066            )
1067        }
1068        DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => {
1069            generic_conversion_single_value!(
1070                TimestampNanosecondType,
1071                as_primitive,
1072                DateTime::from_timestamp_nanos,
1073                typed_value,
1074                index
1075            )
1076        }
1077        DataType::Timestamp(TimeUnit::Nanosecond, None) => {
1078            generic_conversion_single_value!(
1079                TimestampNanosecondType,
1080                as_primitive,
1081                |v| DateTime::from_timestamp_nanos(v).naive_utc(),
1082                typed_value,
1083                index
1084            )
1085        }
1086        // todo other types here (note this is very similar to cast_to_variant.rs)
1087        // so it would be great to figure out how to share this code
1088        _ => {
1089            // We shouldn't panic in production code, but this is a
1090            // placeholder until we implement more types
1091            // https://github.com/apache/arrow-rs/issues/8091
1092            debug_assert!(
1093                false,
1094                "Unsupported typed_value type: {}",
1095                typed_value.data_type()
1096            );
1097            Ok(Variant::Null)
1098        }
1099    }
1100}
1101
1102/// Workaround for lack of direct support for BinaryArray
1103/// <https://github.com/apache/arrow-rs/issues/8387>
1104///
1105/// The values are read as
1106/// * `StructArray<metadata: Binary, value: Binary>`
1107///
1108/// but VariantArray needs them as
1109/// * `StructArray<metadata: BinaryView, value: BinaryView>`
1110///
1111/// So cast them to get the right type.
1112fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef> {
1113    let new_type = canonicalize_and_verify_data_type(array.data_type())?;
1114    if let Cow::Borrowed(_) = new_type {
1115        if let Some(array) = array.as_struct_opt() {
1116            return Ok(Arc::new(array.clone())); // bypass the unnecessary cast
1117        }
1118    }
1119    cast(array, new_type.as_ref())
1120}
1121
1122/// Recursively visits a data type, ensuring that it only contains data types that can legally
1123/// appear in a (possibly shredded) variant array. It also replaces Binary fields with BinaryView,
1124/// since that's what comes back from the parquet reader and what the variant code expects to find.
1125fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_, DataType>> {
1126    use DataType::*;
1127
1128    // helper macros
1129    macro_rules! fail {
1130        () => {
1131            return Err(ArrowError::InvalidArgumentError(format!(
1132                "Illegal shredded value type: {data_type}"
1133            )))
1134        };
1135    }
1136    macro_rules! borrow {
1137        () => {
1138            Cow::Borrowed(data_type)
1139        };
1140    }
1141
1142    let new_data_type = match data_type {
1143        // Primitive arrow types that have a direct variant counterpart are allowed
1144        Null | Boolean => borrow!(),
1145        Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(),
1146
1147        // Unsigned integers and half-float are not allowed
1148        UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(),
1149
1150        // Most decimal types are allowed, with restrictions on precision and scale
1151        //
1152        // NOTE: arrow-parquet reads widens 32- and 64-bit decimals to 128-bit, but the variant spec
1153        // requires using the narrowest decimal type for a given precision. Fix those up first.
1154        Decimal64(p, s) | Decimal128(p, s)
1155            if VariantDecimal4::is_valid_precision_and_scale(p, s) =>
1156        {
1157            Cow::Owned(Decimal32(*p, *s))
1158        }
1159        Decimal128(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => {
1160            Cow::Owned(Decimal64(*p, *s))
1161        }
1162        Decimal32(p, s) if VariantDecimal4::is_valid_precision_and_scale(p, s) => borrow!(),
1163        Decimal64(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => borrow!(),
1164        Decimal128(p, s) if VariantDecimal16::is_valid_precision_and_scale(p, s) => borrow!(),
1165        Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(),
1166
1167        // Only micro and nano timestamps are allowed
1168        Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(),
1169        Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(),
1170
1171        // Only 32-bit dates and 64-bit microsecond time are allowed.
1172        Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
1173        Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
1174
1175        // Binary and string are allowed. Force Binary/LargeBinary to BinaryView because that's what the parquet
1176        // reader returns and what the rest of the variant code expects.
1177        Binary | LargeBinary => Cow::Owned(BinaryView),
1178        BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),
1179
1180        // UUID maps to 16-byte fixed-size binary; no other width is allowed
1181        FixedSizeBinary(16) => borrow!(),
1182        FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
1183
1184        // We can _possibly_ allow (some of) these some day?
1185        ListView(_) | LargeList(_) | LargeListView(_) => {
1186            fail!()
1187        }
1188
1189        // Lists and struct are allowed, maps and unions are not
1190        List(field) => match canonicalize_and_verify_field(field)? {
1191            Cow::Borrowed(_) => borrow!(),
1192            Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)),
1193        },
1194        // Struct is used by the internal layout, and can also represent a shredded variant object.
1195        Struct(fields) => {
1196            // Avoid allocation unless at least one field changes, to avoid unnecessary deep cloning
1197            // of the data type. Even if some fields change, the others are shallow arc clones.
1198            let mut new_fields = std::collections::HashMap::new();
1199            for (i, field) in fields.iter().enumerate() {
1200                if let Cow::Owned(new_field) = canonicalize_and_verify_field(field)? {
1201                    new_fields.insert(i, new_field);
1202                }
1203            }
1204
1205            if new_fields.is_empty() {
1206                borrow!()
1207            } else {
1208                let new_fields = fields
1209                    .iter()
1210                    .enumerate()
1211                    .map(|(i, field)| new_fields.remove(&i).unwrap_or_else(|| field.clone()));
1212                Cow::Owned(DataType::Struct(new_fields.collect()))
1213            }
1214        }
1215        Map(..) | Union(..) => fail!(),
1216
1217        // We can _possibly_ support (some of) these some day?
1218        Dictionary(..) | RunEndEncoded(..) => fail!(),
1219    };
1220    Ok(new_data_type)
1221}
1222
1223fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, Arc<Field>>> {
1224    let Cow::Owned(new_data_type) = canonicalize_and_verify_data_type(field.data_type())? else {
1225        return Ok(Cow::Borrowed(field));
1226    };
1227    let new_field = field.as_ref().clone().with_data_type(new_data_type);
1228    Ok(Cow::Owned(Arc::new(new_field)))
1229}
1230
#[cfg(test)]
mod test {
    use crate::VariantArrayBuilder;
    use std::str::FromStr;

    use super::*;
    use arrow::array::{
        BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array, Int32Array,
        Time64MicrosecondArray,
    };
    use arrow_schema::{Field, Fields};
    use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, ShortString};

    /// One-row BinaryView column, used as a stand-in for `metadata` / `value`
    fn make_binary_view_array() -> ArrayRef {
        let array = BinaryViewArray::from(vec![b"test" as &[u8]]);
        Arc::new(array)
    }

    /// One-row Int32 column, used where a BinaryView column is required
    fn make_int32_array() -> ArrayRef {
        let array = Int32Array::from(vec![1]);
        Arc::new(array)
    }

    #[test]
    fn invalid_not_a_struct_array() {
        // The input is a plain BinaryView array rather than a StructArray, so this must fail
        let array = make_binary_view_array();
        let message = VariantArray::try_new(&array).unwrap_err().to_string();
        assert_eq!(
            message,
            "Invalid argument error: Invalid VariantArray: requires StructArray as input"
        );
    }

    #[test]
    fn invalid_missing_metadata() {
        // The struct has a 'value' column but no 'metadata' column, so this must fail
        let value_field = Field::new("value", DataType::BinaryView, true);
        let array = StructArray::new(
            Fields::from(vec![value_field]),
            vec![make_binary_view_array()],
            None,
        );
        let message = VariantArray::try_new(&array).unwrap_err().to_string();
        assert_eq!(
            message,
            "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
        );
    }

    #[test]
    fn all_null_missing_value_and_typed_value() {
        let metadata_field = Field::new("metadata", DataType::BinaryView, false);
        let array = StructArray::new(
            Fields::from(vec![metadata_field]),
            vec![make_binary_view_array()],
            None,
        );

        // NOTE: By strict spec interpretation, this case (top-level variant with null/null)
        // should be invalid, but we currently allow it and treat it as Variant::Null.
        // This is a pragmatic decision to handle missing data gracefully.
        let variant_array = VariantArray::try_new(&array).unwrap();

        // Neither column is present in the shredding state
        assert!(matches!(
            variant_array.shredding_state(),
            ShreddingState {
                value: None,
                typed_value: None
            }
        ));

        // Every valid row must read back as Variant::Null (compensating for spec violation)
        for row in (0..variant_array.len()).filter(|&row| variant_array.is_valid(row)) {
            assert_eq!(variant_array.value(row), Variant::Null);
        }
    }

    #[test]
    fn invalid_metadata_field_type() {
        let fields = Fields::from(vec![
            Field::new("metadata", DataType::Int32, true), // not supported
            Field::new("value", DataType::BinaryView, true),
        ]);
        let columns = vec![make_int32_array(), make_binary_view_array()];
        let array = StructArray::new(fields, columns, None);

        let message = VariantArray::try_new(&array).unwrap_err().to_string();
        assert_eq!(
            message,
            "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Int32"
        );
    }

    #[test]
    fn invalid_value_field_type() {
        let fields = Fields::from(vec![
            Field::new("metadata", DataType::BinaryView, true),
            Field::new("value", DataType::Int32, true), // Not yet supported
        ]);
        let columns = vec![make_binary_view_array(), make_int32_array()];
        let array = StructArray::new(fields, columns, None);

        let message = VariantArray::try_new(&array).unwrap_err().to_string();
        assert_eq!(
            message,
            "Not yet implemented: VariantArray 'value' field must be BinaryView, got Int32"
        );
    }

    #[test]
    fn all_null_shredding_state() {
        // A state built from two absent columns holds neither of them
        let state = ShreddingState::new(None, None);
        assert!(matches!(
            state,
            ShreddingState {
                value: None,
                typed_value: None
            }
        ));
    }

    #[test]
    fn all_null_variant_array_construction() {
        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
        let nulls = NullBuffer::from(vec![false, false, false]); // all null

        let metadata_field = Field::new("metadata", DataType::BinaryView, false);
        let struct_array = StructArray::new(
            Fields::from(vec![metadata_field]),
            vec![Arc::new(metadata)],
            Some(nulls),
        );

        let variant_array = VariantArray::try_new(&struct_array).unwrap();

        // Neither column is present in the shredding state
        assert!(matches!(
            variant_array.shredding_state(),
            ShreddingState {
                value: None,
                typed_value: None
            }
        ));

        // All three rows exist and every one of them is null
        assert_eq!(variant_array.len(), 3);
        for i in 0..variant_array.len() {
            assert!(
                !variant_array.is_valid(i),
                "Expected value at index {i} to be null"
            );
        }
    }

    #[test]
    fn value_field_present_but_all_null_should_be_unshredded() {
        // A `value` column that exists in the schema must be kept in the shredding state
        // even when every one of its entries is null.
        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);

        // Build a value column whose entries are all null
        let value_nulls = NullBuffer::from(vec![false, false, false]); // all null
        let value_data = BinaryViewArray::from_iter_values(vec![""; 3])
            .to_data()
            .into_builder()
            .nulls(Some(value_nulls))
            .build()
            .unwrap();
        let value = BinaryViewArray::from(value_data);

        let fields = Fields::from(vec![
            Field::new("metadata", DataType::BinaryView, false),
            Field::new("value", DataType::BinaryView, true), // Field exists in schema
        ]);
        // The struct itself has no nulls; only the value column is all null
        let struct_array =
            StructArray::new(fields, vec![Arc::new(metadata), Arc::new(value)], None);

        let variant_array = VariantArray::try_new(&struct_array).unwrap();

        // The value column is present and there is no typed_value column
        assert!(matches!(
            variant_array.shredding_state(),
            ShreddingState {
                value: Some(_),
                typed_value: None
            }
        ));
    }

    #[test]
    fn test_variant_array_iterable() {
        let mut builder = VariantArrayBuilder::new(6);

        builder.append_null();
        builder.append_variant(Variant::from(1_i8));
        builder.append_variant(Variant::Null);
        builder.append_variant(Variant::from(2_i32));
        builder.append_variant(Variant::from(3_i64));
        builder.append_null();

        let array = builder.build();
        let variants: Vec<_> = array.iter().collect();

        // Null rows come back as `None`; a stored Variant::Null is a regular value
        let expected = vec![
            None,
            Some(Variant::Int8(1)),
            Some(Variant::Null),
            Some(Variant::Int32(2)),
            Some(Variant::Int64(3)),
            None,
        ];
        assert_eq!(variants, expected);
    }

    #[test]
    fn test_variant_array_iter_double_ended() {
        let mut builder = VariantArrayBuilder::new(5);

        builder.append_variant(Variant::from(0_i32));
        builder.append_null();
        builder.append_variant(Variant::from(2_i32));
        builder.append_null();
        builder.append_variant(Variant::from(4_i32));

        let array = builder.build();
        let mut iter = array.iter();

        // Take two rows from the front ...
        assert_eq!(iter.next(), Some(Some(Variant::from(0_i32))));
        assert_eq!(iter.next(), Some(None));

        // ... and the remaining three from the back
        assert_eq!(iter.next_back(), Some(Some(Variant::from(4_i32))));
        assert_eq!(iter.next_back(), Some(None));
        assert_eq!(iter.next_back(), Some(Some(Variant::from(2_i32))));

        // Both ends are now exhausted
        assert_eq!(iter.next_back(), None);
        assert_eq!(iter.next(), None);
    }

    #[test]
    fn test_variant_array_iter_reverse() {
        let mut builder = VariantArrayBuilder::new(5);

        builder.append_variant(Variant::from("a"));
        builder.append_null();
        builder.append_variant(Variant::from("aaa"));
        builder.append_null();
        builder.append_variant(Variant::from("aaaaa"));

        let array = builder.build();

        let reversed: Vec<_> = array.iter().rev().collect();
        let expected = vec![
            Some(Variant::from("aaaaa")),
            None,
            Some(Variant::from("aaa")),
            None,
            Some(Variant::from("a")),
        ];
        assert_eq!(reversed, expected);
    }

    #[test]
    fn test_variant_array_iter_empty() {
        let array = VariantArrayBuilder::new(0).build();
        let mut iter = array.iter();
        assert!(iter.next().is_none());
        assert!(iter.next_back().is_none());
    }

    #[test]
    fn test_from_variant_opts_into_variant_array() {
        let input = vec![None, Some(Variant::Null), Some(Variant::BooleanFalse), None];

        let variant_array: VariantArray = input.into_iter().collect();

        assert_eq!(variant_array.len(), 4);

        // `None` becomes a null row
        assert!(variant_array.is_null(0));

        assert!(!variant_array.is_null(1));
        assert_eq!(variant_array.value(1), Variant::Null);

        assert!(!variant_array.is_null(2));
        assert_eq!(variant_array.value(2), Variant::BooleanFalse);

        assert!(variant_array.is_null(3));
    }

    #[test]
    fn test_from_variants_into_variant_array() {
        let input = vec![
            Variant::Null,
            Variant::BooleanFalse,
            Variant::ShortString(ShortString::try_new("norm").unwrap()),
        ];

        let variant_array: VariantArray = input.into_iter().collect();

        assert_eq!(variant_array.len(), 3);

        assert!(!variant_array.is_null(0));
        assert_eq!(variant_array.value(0), Variant::Null);

        assert!(!variant_array.is_null(1));
        assert_eq!(variant_array.value(1), Variant::BooleanFalse);

        assert!(!variant_array.is_null(2));
        assert_eq!(
            variant_array.value(2),
            Variant::ShortString(ShortString::try_new("norm").unwrap())
        );
    }

    #[test]
    fn test_variant_equality() {
        let rows = [None, Some(Variant::BooleanFalse), Some(Variant::Null), None];
        let array: VariantArray = rows.clone().into_iter().collect();

        // A clone compares equal
        {
            let copy = array.clone();
            assert_eq!(array, copy);
        }

        // The same rows in reverse order compare unequal
        {
            let reversed: VariantArray = rows.iter().cloned().rev().collect();
            assert_ne!(array, reversed);
        }

        // A shorter slice compares unequal
        {
            let sliced = array.slice(0, 1);
            assert_ne!(array, sliced);
        }
    }

    /// Generates a test that wraps `$invalid_typed_value` as the `typed_value` column of a
    /// one-row variant array and checks that `try_value(0)` fails with a
    /// `ArrowError::CastError` whose message contains `$error_msg`.
    macro_rules! invalid_variant_array_test {
        ($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => {
            #[test]
            fn $fn_name() {
                let metadata =
                    BinaryViewArray::from_iter_values(std::iter::once(EMPTY_VARIANT_METADATA_BYTES));
                let invalid_typed_value = $invalid_typed_value;

                let struct_array = StructArrayBuilder::new()
                    .with_field("metadata", Arc::new(metadata), false)
                    .with_field("typed_value", Arc::new(invalid_typed_value), true)
                    .build();

                let array =
                    VariantArray::try_new(&struct_array).expect("should create variant array");

                let Err(error) = array.try_value(0) else {
                    panic!("expected try_value(0) to return an error");
                };
                assert!(matches!(error, ArrowError::CastError(_)));

                let expected: &str = $error_msg;
                assert!(
                    error.to_string().contains(expected),
                    "error `{}` did not contain `{}`",
                    error,
                    expected
                )
            }
        };
    }

    invalid_variant_array_test!(
        test_variant_array_invalide_time,
        Time64MicrosecondArray::from(vec![Some(86401000000)]),
        "Cast error: Cast failed at index 0 (array type: Time64(µs)): Invalid microsecond from midnight: 86401000000"
    );

    invalid_variant_array_test!(
        test_variant_array_invalid_decimal32,
        Decimal32Array::from(vec![Some(1234567890)]),
        "Cast error: Cast failed at index 0 (array type: Decimal32(9, 2)): Invalid argument error: 1234567890 is wider than max precision 9"
    );

    invalid_variant_array_test!(
        test_variant_array_invalid_decimal64,
        Decimal64Array::from(vec![Some(1234567890123456789)]),
        "Cast error: Cast failed at index 0 (array type: Decimal64(18, 6)): Invalid argument error: 1234567890123456789 is wider than max precision 18"
    );

    invalid_variant_array_test!(
        test_variant_array_invalid_decimal128,
        Decimal128Array::from(vec![Some(
            i128::from_str("123456789012345678901234567890123456789").unwrap()
        ),]),
        "Cast error: Cast failed at index 0 (array type: Decimal128(38, 10)): Invalid argument error: 123456789012345678901234567890123456789 is wider than max precision 38"
    );
}