Skip to main content

parquet_variant_compute/
variant_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArray`] implementation
19
20use crate::VariantArrayBuilder;
21use crate::type_conversion::{
22    generic_conversion_single_value, generic_conversion_single_value_with_result,
23    primitive_conversion_single_value,
24};
25use arrow::array::{Array, ArrayRef, AsArray, StructArray};
26use arrow::buffer::NullBuffer;
27use arrow::compute::cast;
28use arrow::datatypes::{
29    Date32Type, Decimal32Type, Decimal64Type, Decimal128Type, Float16Type, Float32Type,
30    Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, Time64MicrosecondType,
31    TimestampMicrosecondType, TimestampNanosecondType,
32};
33use arrow::error::Result;
34use arrow_schema::extension::{ExtensionType, Uuid as UuidExtension};
35use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
36use chrono::{DateTime, NaiveTime};
37use parquet_variant::{
38    Uuid, Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16, VariantDecimalType as _,
39};
40
41use std::borrow::Cow;
42use std::sync::Arc;
43
44/// Returns the raw bytes at the given index from a binary-like array, return `None` if the array isn't binary-like.
45pub(crate) fn binary_array_value(array: &dyn Array, index: usize) -> Option<&[u8]> {
46    match array.data_type() {
47        DataType::Binary => Some(array.as_binary::<i32>().value(index)),
48        DataType::LargeBinary => Some(array.as_binary::<i64>().value(index)),
49        DataType::BinaryView => Some(array.as_binary_view().value(index)),
50        _ => None,
51    }
52}
53
54/// Returns a [`Variant`] from a `metadata` and `value` byte arrays, returns `None`
55/// if one of them is of invalid type.
56pub(crate) fn variant_from_arrays_at<'m, 'v>(
57    metadata: &'m dyn Array,
58    value: &'v dyn Array,
59    index: usize,
60) -> Option<Variant<'m, 'v>> {
61    let metadata = binary_array_value(metadata, index)?;
62    let value = binary_array_value(value, index)?;
63    Some(Variant::new(metadata, value))
64}
65
66/// Validates that an array has a binary-like data type.
67pub(crate) fn validate_binary_array(array: &dyn Array, field_name: &str) -> Result<()> {
68    match array.data_type() {
69        DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Ok(()),
70        _ => Err(ArrowError::InvalidArgumentError(format!(
71            "VariantArray '{field_name}' field must be Binary, LargeBinary, or BinaryView, got {}",
72            array.data_type()
73        ))),
74    }
75}
76
/// The Arrow Variant [`ExtensionType`].
///
/// This is the canonical Arrow Extension Type used to mark a field as
/// storing variants. See [`VariantArray`] for examples of how this
/// extension type is used.
pub struct VariantType;
82
83impl ExtensionType for VariantType {
84    const NAME: &'static str = "arrow.parquet.variant";
85
86    // Variants extension metadata is an empty string
87    // <https://github.com/apache/arrow/blob/d803afcc43f5d132506318fd9e162d33b2c3d4cd/docs/source/format/CanonicalExtensions.rst?plain=1#L473>
88    type Metadata = &'static str;
89
90    fn metadata(&self) -> &Self::Metadata {
91        &""
92    }
93
94    fn serialize_metadata(&self) -> Option<String> {
95        Some(String::new())
96    }
97
98    fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata> {
99        Ok("")
100    }
101
102    fn supports_data_type(&self, data_type: &DataType) -> Result<()> {
103        if matches!(data_type, DataType::Struct(_)) {
104            Ok(())
105        } else {
106            Err(ArrowError::InvalidArgumentError(format!(
107                "VariantType only supports StructArray, got {data_type}"
108            )))
109        }
110    }
111
112    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self> {
113        Self.supports_data_type(data_type)?;
114        Ok(Self)
115    }
116
117    fn validate(data_type: &DataType, _metadata: Self::Metadata) -> Result<()> {
118        Self.supports_data_type(data_type)
119    }
120}
121
122/// An array of Parquet [`Variant`] values
123///
124/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
125/// `metadata` and `value` fields, and adds convenience methods to access
126/// the [`Variant`]s.
127///
128/// See [`VariantArrayBuilder`] for constructing `VariantArray` row by row.
129///
130/// See the examples below for converting between `VariantArray` and
131/// `StructArray`.
132///
133/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder
134///
135/// # Documentation
136///
137/// Variant is documented as a canonical Arrow extension type in the
138/// [Parquet Variant] section of the [official list of extension types] on
139/// the Apache Arrow website.
140///
141/// [Parquet Variant]: https://arrow.apache.org/docs/format/CanonicalExtensions.html#parquet-variant
142/// [official list of extension types]: https://arrow.apache.org/docs/format/CanonicalExtensions.html
143///
144/// # Example: Check if a [`StructArray`] has the [`VariantType`] extension
145///
146/// Arrow Arrays only provide [`DataType`], but the extension type information
147/// is stored on a [`Field`]. Thus, you must have access to the [`Schema`] or
148/// [`Field`] to check for the extension type.
149///
150/// [`Schema`]: arrow_schema::Schema
151/// ```
152/// # use arrow::array::StructArray;
153/// # use arrow_schema::{Schema, Field, DataType};
154/// # use parquet_variant::Variant;
155/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
156/// # fn get_variant_array() -> VariantArray {
157/// #   let mut builder = VariantArrayBuilder::new(10);
158/// #   builder.append_variant(Variant::from("such wow"));
159/// #   builder.build()
160/// # }
161/// # fn get_schema() -> Schema {
162/// #   Schema::new(vec![
163/// #     Field::new("id", DataType::Int32, false),
164/// #     get_variant_array().field("var"),
165/// #   ])
166/// # }
167/// let schema = get_schema();
168/// assert_eq!(schema.fields().len(), 2);
169/// // first field is not a Variant
170/// assert!(!schema.field(0).has_valid_extension_type::<VariantType>());
171/// // second field is a Variant
172/// assert!(schema.field(1).has_valid_extension_type::<VariantType>());
173/// ```
174///
175/// # Example: Constructing the correct [`Field`] for a [`VariantArray`]
176///
177/// You can construct the correct [`Field`] for a [`VariantArray`] using the
178/// [`VariantArray::field`] method.
179///
180/// ```
181/// # use arrow_schema::{Schema, Field, DataType};
182/// # use parquet_variant::Variant;
183/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
184/// # fn get_variant_array() -> VariantArray {
185/// #   let mut builder = VariantArrayBuilder::new(10);
186/// #   builder.append_variant(Variant::from("such wow"));
187/// #   builder.build()
188/// # }
189/// let variant_array = get_variant_array();
190/// // First field is an integer id, second field is a variant
191/// let schema = Schema::new(vec![
192///   Field::new("id", DataType::Int32, false),
193///   // call VariantArray::field to get the correct Field
194///   variant_array.field("var"),
195/// ]);
196/// ```
197///
198/// You can also construct the [`Field`] using [`VariantType`] directly
199///
200/// ```
201/// # use arrow_schema::{Schema, Field, DataType};
202/// # use parquet_variant::Variant;
203/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
204/// # fn get_variant_array() -> VariantArray {
205/// #   let mut builder = VariantArrayBuilder::new(10);
206/// #   builder.append_variant(Variant::from("such wow"));
207/// #   builder.build()
208/// # }
209/// # let variant_array = get_variant_array();
210/// // The DataType of a VariantArray varies depending on how it is shredded
211/// let data_type = variant_array.data_type().clone();
212/// // First field is an integer id, second field is a variant
213/// let schema = Schema::new(vec![
214///   Field::new("id", DataType::Int32, false),
215///   Field::new("var", data_type, false)
216///     // Add extension metadata to the field using `VariantType`
217///     .with_extension_type(VariantType),
218/// ]);
219/// ```
220///
221/// # Example: Converting a [`VariantArray`] to a [`StructArray`]
222///
223/// ```
224/// # use arrow::array::StructArray;
225/// # use parquet_variant::Variant;
226/// # use parquet_variant_compute::VariantArrayBuilder;
227/// // Create Variant Array
228/// let mut builder = VariantArrayBuilder::new(10);
229/// builder.append_variant(Variant::from("such wow"));
230/// let variant_array = builder.build();
231/// // convert to StructArray
232/// let struct_array: StructArray = variant_array.into();
233/// ```
234///
235/// # Example: Converting a [`StructArray`] to a [`VariantArray`]
236///
237/// ```
238/// # use arrow::array::StructArray;
239/// # use parquet_variant::Variant;
240/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray};
241/// # fn get_struct_array() -> StructArray {
242/// #   let mut builder = VariantArrayBuilder::new(10);
243/// #   builder.append_variant(Variant::from("such wow"));
244/// #   builder.build().into()
245/// # }
246/// let struct_array: StructArray = get_struct_array();
247/// // try and create a VariantArray from it
248/// let variant_array = VariantArray::try_new(&struct_array).unwrap();
249/// assert_eq!(variant_array.value(0), Variant::from("such wow"));
250/// ```
251///
252#[derive(Debug, Clone)]
253pub struct VariantArray {
254    /// Reference to the underlying StructArray
255    inner: StructArray,
256
257    /// The metadata column of this variant (Binary, LargeBinary, or BinaryView)
258    metadata: ArrayRef,
259
260    /// how is this variant array shredded?
261    shredding_state: ShreddingState,
262}
263
264impl VariantArray {
265    /// Creates a new `VariantArray` from a [`StructArray`].
266    ///
267    /// # Arguments
268    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
269    ///
270    /// # Returns
271    /// - A new instance of `VariantArray`.
272    ///
273    /// # Errors:
274    /// - If the `StructArray` does not contain the required fields
275    ///
276    /// # Requirements of the `StructArray`
277    ///
278    /// 1. A required field named `metadata` which is binary, large_binary, or
279    ///    binary_view
280    ///
281    /// 2. An optional field named `value` that is binary, large_binary, or
282    ///    binary_view
283    ///
284    /// 3. An optional field named `typed_value` which can be any primitive type
285    ///    or be a list, large_list, list_view or struct
286    ///
287    /// NOTE: It is also permissible for the metadata field to be
288    /// Dictionary-Encoded, preferably (but not required) with an index type of
289    /// int8.
290    ///
291    pub fn try_new(inner: &dyn Array) -> Result<Self> {
292        // Canonicalize shredded typed_value fields (e.g. decimal narrowing)
293        let inner = canonicalize_shredded_types(inner)?;
294
295        let Some(inner) = inner.as_struct_opt() else {
296            return Err(ArrowError::InvalidArgumentError(
297                "Invalid VariantArray: requires StructArray as input".to_string(),
298            ));
299        };
300
301        // Note the specification allows for any order so we must search by name
302
303        // Ensure the StructArray has a metadata field that is a binary type
304        let Some(metadata_col) = inner.column_by_name("metadata") else {
305            return Err(ArrowError::InvalidArgumentError(
306                "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
307            ));
308        };
309        validate_binary_array(metadata_col.as_ref(), "metadata")?;
310
311        // Note these clones are cheap, they just bump the ref count
312        Ok(Self {
313            inner: inner.clone(),
314            metadata: metadata_col.clone(),
315            shredding_state: ShreddingState::try_from(inner)?,
316        })
317    }
318
319    pub(crate) fn from_parts(
320        metadata: ArrayRef,
321        value: Option<ArrayRef>,
322        typed_value: Option<ArrayRef>,
323        nulls: Option<NullBuffer>,
324    ) -> Self {
325        let mut builder = StructArrayBuilder::new().with_field("metadata", metadata.clone(), false);
326        if let Some(value) = value.clone() {
327            builder = builder.with_field("value", value, true);
328        }
329        if let Some(typed_value) = typed_value.clone() {
330            builder = builder.with_field_ref(typed_value_field(&typed_value), typed_value);
331        }
332        if let Some(nulls) = nulls {
333            builder = builder.with_nulls(nulls);
334        }
335
336        Self {
337            inner: builder.build(),
338            metadata,
339            shredding_state: ShreddingState::new(value, typed_value),
340        }
341    }
342
343    /// Returns a reference to the underlying [`StructArray`].
344    pub fn inner(&self) -> &StructArray {
345        &self.inner
346    }
347
348    /// Returns the inner [`StructArray`], consuming self
349    pub fn into_inner(self) -> StructArray {
350        self.inner
351    }
352
353    /// Return the shredding state of this `VariantArray`
354    pub fn shredding_state(&self) -> &ShreddingState {
355        &self.shredding_state
356    }
357
358    /// Return the [`Variant`] instance stored at the given row
359    ///
360    /// This is a convenience wrapper that calls [`VariantArray::try_value`] and unwraps the `Result`.
361    /// Use `try_value` if you need to handle conversion errors gracefully.
362    ///
363    /// # Panics
364    /// * if the index is out of bounds
365    /// * if the array value is null
366    /// * if `try_value` returns an error.
367    pub fn value(&self, index: usize) -> Variant<'_, '_> {
368        self.try_value(index).unwrap()
369    }
370
371    /// Return the [`Variant`] instance stored at the given row
372    ///
373    /// Note: This method does not check for nulls and the value is arbitrary
374    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
375    ///
376    /// # Panics
377    ///
378    /// Panics if
379    /// * the index is out of bounds
380    /// * the array value is null
381    ///
382    /// # Errors
383    ///
384    /// Errors if
385    /// - the data in `typed_value` cannot be interpreted as a valid `Variant`
386    ///
387    /// If this is a shredded variant but has no value at the shredded location, it
388    /// will return [`Variant::Null`].
389    ///
390    ///
391    /// # Performance Note
392    ///
393    /// This is certainly not the most efficient way to access values in a
394    /// `VariantArray`, but it is useful for testing and debugging.
395    ///
396    /// Note: Does not do deep validation of the [`Variant`], so it is up to the
397    /// caller to ensure that the metadata and value were constructed correctly.
398    pub fn try_value(&self, index: usize) -> Result<Variant<'_, '_>> {
399        match (self.typed_value_field(), self.value_field()) {
400            // Always prefer typed_value, if available
401            (Some(typed_value), value) if typed_value.is_valid(index) => {
402                typed_value_to_variant(typed_value, value, index)
403            }
404            // Otherwise fall back to value, if available
405            (_, Some(value)) if value.is_valid(index) => variant_from_arrays_at(
406                &self.metadata,
407                value,
408                index,
409            )
410            .ok_or_else(|| {
411                ArrowError::InvalidArgumentError(format!(
412                    "metadata and value fields must be binary-like arrays, instead got {} and {}",
413                    self.metadata.data_type(),
414                    value.data_type()
415                ))
416            }),
417            // It is technically invalid for neither value nor typed_value fields to be available,
418            // but the spec specifically requires readers to return Variant::Null in this case.
419            _ => Ok(Variant::Null),
420        }
421    }
422
423    /// Return a reference to the metadata field of the [`StructArray`]
424    pub fn metadata_field(&self) -> &ArrayRef {
425        &self.metadata
426    }
427
428    /// Return a reference to the value field of the `StructArray`
429    pub fn value_field(&self) -> Option<&ArrayRef> {
430        self.shredding_state.value_field()
431    }
432
433    /// Return a reference to the typed_value field of the `StructArray`, if present
434    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
435        self.shredding_state.typed_value_field()
436    }
437
438    /// Return a field to represent this VariantArray in a `Schema` with
439    /// a particular name
440    pub fn field(&self, name: impl Into<String>) -> Field {
441        Field::new(
442            name.into(),
443            self.data_type().clone(),
444            self.inner.is_nullable(),
445        )
446        .with_extension_type(VariantType)
447    }
448
449    /// Returns a new DataType representing this VariantArray's inner type
450    pub fn data_type(&self) -> &DataType {
451        self.inner.data_type()
452    }
453
454    pub fn slice(&self, offset: usize, length: usize) -> Self {
455        let inner = self.inner.slice(offset, length);
456        let metadata = self.metadata.slice(offset, length);
457        let shredding_state = self.shredding_state.slice(offset, length);
458        Self {
459            inner,
460            metadata,
461            shredding_state,
462        }
463    }
464
465    pub fn len(&self) -> usize {
466        self.inner.len()
467    }
468
469    pub fn is_empty(&self) -> bool {
470        self.inner.is_empty()
471    }
472
473    pub fn nulls(&self) -> Option<&NullBuffer> {
474        self.inner.nulls()
475    }
476
477    /// Is the element at index null?
478    pub fn is_null(&self, index: usize) -> bool {
479        self.nulls().is_some_and(|n| n.is_null(index))
480    }
481
482    /// Is the element at index valid (not null)?
483    pub fn is_valid(&self, index: usize) -> bool {
484        !self.is_null(index)
485    }
486
487    /// Returns an iterator over the values in this array
488    pub fn iter(&self) -> VariantArrayIter<'_> {
489        VariantArrayIter::new(self)
490    }
491}
492
493impl PartialEq for VariantArray {
494    fn eq(&self, other: &Self) -> bool {
495        self.inner == other.inner
496    }
497}
498
499impl From<VariantArray> for StructArray {
500    fn from(variant_array: VariantArray) -> Self {
501        variant_array.into_inner()
502    }
503}
504
505impl From<VariantArray> for ArrayRef {
506    fn from(variant_array: VariantArray) -> Self {
507        Arc::new(variant_array.into_inner())
508    }
509}
510
511impl<'m, 'v> FromIterator<Option<Variant<'m, 'v>>> for VariantArray {
512    fn from_iter<T: IntoIterator<Item = Option<Variant<'m, 'v>>>>(iter: T) -> Self {
513        let iter = iter.into_iter();
514
515        let mut b = VariantArrayBuilder::new(iter.size_hint().0);
516        b.extend(iter);
517        b.build()
518    }
519}
520
521impl<'m, 'v> FromIterator<Variant<'m, 'v>> for VariantArray {
522    fn from_iter<T: IntoIterator<Item = Variant<'m, 'v>>>(iter: T) -> Self {
523        Self::from_iter(iter.into_iter().map(Some))
524    }
525}
526
527/// An iterator over [`VariantArray`]
528///
529/// This iterator returns `Option<Option<Variant<'a, 'a>>>` where:
530/// - `None` indicates the end of iteration
531/// - `Some(None)` indicates a null value at this position
532/// - `Some(Some(variant))` indicates a valid variant value
533///
534/// # Example
535///
536/// ```
537/// # use parquet_variant::Variant;
538/// # use parquet_variant_compute::VariantArrayBuilder;
539/// let mut builder = VariantArrayBuilder::new(10);
540/// builder.append_variant(Variant::from(42));
541/// builder.append_null();
542/// builder.append_variant(Variant::from("hello"));
543/// let array = builder.build();
544///
545/// let values = array.iter().collect::<Vec<_>>();
546/// assert_eq!(values.len(), 3);
547/// assert_eq!(values[0], Some(Variant::from(42)));
548/// assert_eq!(values[1], None);
549/// assert_eq!(values[2], Some(Variant::from("hello")));
550/// ```
551#[derive(Debug)]
552pub struct VariantArrayIter<'a> {
553    array: &'a VariantArray,
554    head_i: usize,
555    tail_i: usize,
556}
557
558impl<'a> VariantArrayIter<'a> {
559    /// Creates a new iterator over the given [`VariantArray`]
560    pub fn new(array: &'a VariantArray) -> Self {
561        Self {
562            array,
563            head_i: 0,
564            tail_i: array.len(),
565        }
566    }
567
568    fn value_opt(&self, i: usize) -> Option<Variant<'a, 'a>> {
569        self.array.is_valid(i).then(|| self.array.value(i))
570    }
571}
572
573impl<'a> Iterator for VariantArrayIter<'a> {
574    type Item = Option<Variant<'a, 'a>>;
575
576    #[inline]
577    fn next(&mut self) -> Option<Self::Item> {
578        if self.head_i == self.tail_i {
579            return None;
580        }
581
582        let out = self.value_opt(self.head_i);
583
584        self.head_i += 1;
585
586        Some(out)
587    }
588
589    fn size_hint(&self) -> (usize, Option<usize>) {
590        let remainder = self.tail_i - self.head_i;
591
592        (remainder, Some(remainder))
593    }
594}
595
596impl<'a> DoubleEndedIterator for VariantArrayIter<'a> {
597    fn next_back(&mut self) -> Option<Self::Item> {
598        if self.head_i == self.tail_i {
599            return None;
600        }
601
602        self.tail_i -= 1;
603
604        Some(self.value_opt(self.tail_i))
605    }
606}
607
608impl<'a> ExactSizeIterator for VariantArrayIter<'a> {}
609
610/// One shredded field of a partially or perfectly shredded variant. For example, suppose the
611/// shredding schema for variant `v` treats it as an object with a single field `a`, where `a` is
612/// itself a struct with the single field `b` of type INT. Then the physical layout of the column
613/// is:
614///
615/// ```text
616/// v: VARIANT {
617///     metadata: BINARY,
618///     value: BINARY,
619///     typed_value: STRUCT {
620///         a: SHREDDED_VARIANT_FIELD {
621///             value: BINARY,
622///             typed_value: STRUCT {
623///                 b: SHREDDED_VARIANT_FIELD {
624///                     value: BINARY,
625///                     typed_value: INT,
626///                 },
627///             },
628///         },
629///     },
630/// }
631/// ```
632///
633/// In the above, each row of `v.value` is either a variant value (shredding failed, `v` was not an
634/// object at all) or a variant object (partial shredding, `v` was an object but included unexpected
635/// fields other than `a`), or is NULL (perfect shredding, `v` was an object containing only the
636/// single expected field `a`).
637///
638/// A similar story unfolds for each `v.typed_value.a.value` -- a variant value if shredding failed
639/// (`v:a` was not an object at all), or a variant object (`v:a` was an object with unexpected
640/// additional fields), or NULL (`v:a` was an object containing only the single expected field `b`).
641///
642/// Finally, `v.typed_value.a.typed_value.b.value` is either NULL (`v:a.b` was an integer) or else a
643/// variant value (which could be `Variant::Null`).
644#[derive(Debug)]
645pub struct ShreddedVariantFieldArray {
646    /// Reference to the underlying StructArray
647    inner: StructArray,
648    shredding_state: ShreddingState,
649}
650
651#[allow(unused)]
652impl ShreddedVariantFieldArray {
653    /// Creates a new `ShreddedVariantFieldArray` from a [`StructArray`].
654    ///
655    /// # Arguments
656    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
657    ///
658    /// # Returns
659    /// - A new instance of `ShreddedVariantFieldArray`.
660    ///
661    /// # Errors:
662    /// - If the `StructArray` does not contain the required fields
663    ///
664    /// # Requirements of the `StructArray`
665    ///
666    /// 1. An optional field named `value` that is binary, large_binary, or
667    ///    binary_view
668    ///
669    /// 2. An optional field named `typed_value` which can be any primitive type
670    ///    or be a list, large_list, list_view or struct
671    ///
672    pub fn try_new(inner: &dyn Array) -> Result<Self> {
673        let Some(inner_struct) = inner.as_struct_opt() else {
674            return Err(ArrowError::InvalidArgumentError(
675                "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(),
676            ));
677        };
678
679        // Note this clone is cheap, it just bumps the ref count
680        Ok(Self {
681            inner: inner_struct.clone(),
682            shredding_state: ShreddingState::try_from(inner_struct)?,
683        })
684    }
685
686    /// Return the shredding state of this `VariantArray`
687    pub fn shredding_state(&self) -> &ShreddingState {
688        &self.shredding_state
689    }
690
691    /// Return a reference to the value field of the `StructArray`
692    pub fn value_field(&self) -> Option<&ArrayRef> {
693        self.shredding_state.value_field()
694    }
695
696    /// Return a reference to the typed_value field of the `StructArray`, if present
697    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
698        self.shredding_state.typed_value_field()
699    }
700
701    /// Returns a reference to the underlying [`StructArray`].
702    pub fn inner(&self) -> &StructArray {
703        &self.inner
704    }
705
706    pub(crate) fn from_parts(
707        value: Option<ArrayRef>,
708        typed_value: Option<ArrayRef>,
709        nulls: Option<NullBuffer>,
710    ) -> Self {
711        let mut builder = StructArrayBuilder::new();
712        if let Some(value) = value.clone() {
713            builder = builder.with_field("value", value, true);
714        }
715        if let Some(typed_value) = typed_value.clone() {
716            builder = builder.with_field_ref(typed_value_field(&typed_value), typed_value);
717        }
718        if let Some(nulls) = nulls {
719            builder = builder.with_nulls(nulls);
720        }
721
722        Self {
723            inner: builder.build(),
724            shredding_state: ShreddingState::new(value, typed_value),
725        }
726    }
727
728    /// Returns the inner [`StructArray`], consuming self
729    pub fn into_inner(self) -> StructArray {
730        self.inner
731    }
732
733    pub fn data_type(&self) -> &DataType {
734        self.inner.data_type()
735    }
736
737    pub fn len(&self) -> usize {
738        self.inner.len()
739    }
740
741    pub fn is_empty(&self) -> bool {
742        self.inner.is_empty()
743    }
744
745    pub fn offset(&self) -> usize {
746        self.inner.offset()
747    }
748
749    pub fn nulls(&self) -> Option<&NullBuffer> {
750        // According to the shredding spec, ShreddedVariantFieldArray should be
751        // physically non-nullable - SQL NULL is inferred by both value and
752        // typed_value being physically NULL
753        None
754    }
755    /// Is the element at index null?
756    pub fn is_null(&self, index: usize) -> bool {
757        self.nulls().is_some_and(|n| n.is_null(index))
758    }
759
760    /// Is the element at index valid (not null)?
761    pub fn is_valid(&self, index: usize) -> bool {
762        !self.is_null(index)
763    }
764}
765
766impl From<ShreddedVariantFieldArray> for ArrayRef {
767    fn from(array: ShreddedVariantFieldArray) -> Self {
768        Arc::new(array.into_inner())
769    }
770}
771
772impl From<ShreddedVariantFieldArray> for StructArray {
773    fn from(array: ShreddedVariantFieldArray) -> Self {
774        array.into_inner()
775    }
776}
777
778/// Represents the shredding state of a [`VariantArray`]
779///
780/// [`VariantArray`]s can be shredded according to the [Parquet Variant
781/// Shredding Spec]. Shredding means that the actual value is stored in a typed
782/// `typed_value` field instead of the generic `value` field.
783///
784/// Both value and typed_value are optional fields used together to encode a
785/// single value. Values in the two fields must be interpreted according to the
786/// following table (see [Parquet Variant Shredding Spec] for more details):
787///
788/// | value    | typed_value  | Meaning |
789/// |----------|--------------|---------|
790/// | NULL     | NULL         | The value is missing; only valid for shredded object fields |
791/// | non-NULL | NULL         | The value is present and may be any type, including [`Variant::Null`] |
792/// | NULL     | non-NULL     | The value is present and is the shredded type |
793/// | non-NULL | non-NULL     | The value is present and is a partially shredded object |
794///
795///
796/// Applying the above rules to entire columns, we obtain the following:
797///
798/// | value  | typed_value  | Meaning |
799/// |--------|-------------|---------|
800/// | --     | --          | **Missing**: The value is always missing; only valid for shredded object fields |
801/// | exists | --          | **Unshredded**: If present, the value may be any type, including [`Variant::Null`] |
802/// | --     | exists      | **Perfectly shredded**: If present, the value is always the shredded type |
803/// | exists | exists      | **Imperfectly shredded**: The value might (not) be present and might (not) be the shredded type |
804///
805/// NOTE: Partial shredding is a row-wise situation that can arise under imperfect shredding (a
806/// column-wise situation): When both columns exist (imperfect shredding) and the typed_value column
807/// is a struct, then both columns can be non-NULL for the same row if value is a variant object
808/// (partial shredding).
809///
810/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
811#[derive(Debug, Clone)]
812pub struct ShreddingState {
813    value: Option<ArrayRef>,
814    typed_value: Option<ArrayRef>,
815}
816
817impl ShreddingState {
818    /// Create a new `ShreddingState` from the given `value` and `typed_value` fields
819    ///
820    /// Note you can create a `ShreddingState` from a &[`StructArray`] using
821    /// `ShreddingState::try_from(&struct_array)`, for example:
822    ///
823    /// ```no_run
824    /// # use arrow::array::StructArray;
825    /// # use parquet_variant_compute::ShreddingState;
826    /// # fn get_struct_array() -> StructArray {
827    /// #   unimplemented!()
828    /// # }
829    /// let struct_array: StructArray = get_struct_array();
830    /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
831    /// ```
832    pub fn new(value: Option<ArrayRef>, typed_value: Option<ArrayRef>) -> Self {
833        Self { value, typed_value }
834    }
835
836    /// Return a reference to the value field, if present
837    pub fn value_field(&self) -> Option<&ArrayRef> {
838        self.value.as_ref()
839    }
840
841    /// Return a reference to the typed_value field, if present
842    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
843        self.typed_value.as_ref()
844    }
845
846    /// Slice all the underlying arrays
847    pub fn slice(&self, offset: usize, length: usize) -> Self {
848        Self {
849            value: self.value.as_ref().map(|v| v.slice(offset, length)),
850            typed_value: self.typed_value.as_ref().map(|tv| tv.slice(offset, length)),
851        }
852    }
853}
854
855impl TryFrom<&StructArray> for ShreddingState {
856    type Error = ArrowError;
857
858    fn try_from(inner_struct: &StructArray) -> Result<Self> {
859        // The `value` column need not exist, but if it does it must be a binary type.
860        let value = if let Some(value_col) = inner_struct.column_by_name("value") {
861            validate_binary_array(value_col.as_ref(), "value")?;
862            Some(value_col.clone())
863        } else {
864            None
865        };
866        let typed_value = inner_struct.column_by_name("typed_value").cloned();
867        Ok(ShreddingState::new(value, typed_value))
868    }
869}
870
871/// Build the `typed_value` [`FieldRef`] for a shredded column.
872///
873/// The Variant spec maps `FixedSizeBinary(16)` exclusively to UUID, so any
874/// shredded column of that type must carry the canonical [`UuidExtension`]
875/// extension metadata on its field.
876fn typed_value_field(array: &ArrayRef) -> FieldRef {
877    let mut field = Field::new("typed_value", array.data_type().clone(), true);
878    if matches!(array.data_type(), DataType::FixedSizeBinary(16)) {
879        field = field.with_extension_type(UuidExtension);
880    }
881    Arc::new(field)
882}
883
884/// Builds struct arrays from component fields
885///
886/// TODO: move to arrow crate
887#[derive(Debug, Default, Clone)]
888pub(crate) struct StructArrayBuilder {
889    fields: Vec<FieldRef>,
890    arrays: Vec<ArrayRef>,
891    nulls: Option<NullBuffer>,
892}
893
894impl StructArrayBuilder {
895    pub fn new() -> Self {
896        Default::default()
897    }
898
899    /// Add an array to this struct array as a field with the specified name.
900    pub fn with_field(mut self, field_name: &str, array: ArrayRef, nullable: bool) -> Self {
901        let field = Field::new(field_name, array.data_type().clone(), nullable);
902        self.fields.push(Arc::new(field));
903        self.arrays.push(array);
904        self
905    }
906
907    /// Add an array to this struct array using a caller-supplied [`FieldRef`].
908    ///
909    /// Use this when the field carries metadata (e.g. an extension type) that
910    /// would be lost if the field were synthesized from the array's data type alone.
911    pub fn with_field_ref(mut self, field: FieldRef, array: ArrayRef) -> Self {
912        self.fields.push(field);
913        self.arrays.push(array);
914        self
915    }
916
917    /// Set the null buffer for this struct array.
918    pub fn with_nulls(mut self, nulls: NullBuffer) -> Self {
919        self.nulls = Some(nulls);
920        self
921    }
922
923    pub fn build(self) -> StructArray {
924        let Self {
925            fields,
926            arrays,
927            nulls,
928        } = self;
929        StructArray::new(Fields::from(fields), arrays, nulls)
930    }
931}
932
933/// returns the non-null element at index as a Variant
934fn typed_value_to_variant<'a>(
935    typed_value: &'a ArrayRef,
936    value: Option<&'a ArrayRef>,
937    index: usize,
938) -> Result<Variant<'a, 'a>> {
939    let data_type = typed_value.data_type();
940    if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) {
941        // Only a partially shredded struct is allowed to have values for both columns
942        panic!("Invalid variant, conflicting value and typed_value");
943    }
944    match data_type {
945        DataType::Null => Ok(Variant::Null),
946        DataType::Boolean => {
947            let boolean_array = typed_value.as_boolean();
948            let value = boolean_array.value(index);
949            Ok(Variant::from(value))
950        }
951        // 16-byte FixedSizeBinary alway corresponds to a UUID; all other sizes are illegal.
952        DataType::FixedSizeBinary(16) => {
953            let array = typed_value.as_fixed_size_binary();
954            let value = array.value(index);
955            Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe: slice is always 16 bytes
956        }
957        DataType::Binary => {
958            let array = typed_value.as_binary::<i32>();
959            let value = array.value(index);
960            Ok(Variant::from(value))
961        }
962        DataType::LargeBinary => {
963            let array = typed_value.as_binary::<i64>();
964            let value = array.value(index);
965            Ok(Variant::from(value))
966        }
967        DataType::BinaryView => {
968            let array = typed_value.as_binary_view();
969            let value = array.value(index);
970            Ok(Variant::from(value))
971        }
972        DataType::Utf8 => {
973            let array = typed_value.as_string::<i32>();
974            let value = array.value(index);
975            Ok(Variant::from(value))
976        }
977        DataType::LargeUtf8 => {
978            let array = typed_value.as_string::<i64>();
979            let value = array.value(index);
980            Ok(Variant::from(value))
981        }
982        DataType::Utf8View => {
983            let array = typed_value.as_string_view();
984            let value = array.value(index);
985            Ok(Variant::from(value))
986        }
987        DataType::Int8 => {
988            primitive_conversion_single_value!(Int8Type, typed_value, index)
989        }
990        DataType::Int16 => {
991            primitive_conversion_single_value!(Int16Type, typed_value, index)
992        }
993        DataType::Int32 => {
994            primitive_conversion_single_value!(Int32Type, typed_value, index)
995        }
996        DataType::Int64 => {
997            primitive_conversion_single_value!(Int64Type, typed_value, index)
998        }
999        DataType::Float16 => {
1000            primitive_conversion_single_value!(Float16Type, typed_value, index)
1001        }
1002        DataType::Float32 => {
1003            primitive_conversion_single_value!(Float32Type, typed_value, index)
1004        }
1005        DataType::Float64 => {
1006            primitive_conversion_single_value!(Float64Type, typed_value, index)
1007        }
1008        DataType::Decimal32(_, s) => {
1009            generic_conversion_single_value_with_result!(
1010                Decimal32Type,
1011                as_primitive,
1012                |v| VariantDecimal4::try_new(v, *s as u8),
1013                typed_value,
1014                index
1015            )
1016        }
1017        DataType::Decimal64(_, s) => {
1018            generic_conversion_single_value_with_result!(
1019                Decimal64Type,
1020                as_primitive,
1021                |v| VariantDecimal8::try_new(v, *s as u8),
1022                typed_value,
1023                index
1024            )
1025        }
1026        DataType::Decimal128(_, s) => {
1027            generic_conversion_single_value_with_result!(
1028                Decimal128Type,
1029                as_primitive,
1030                |v| VariantDecimal16::try_new(v, *s as u8),
1031                typed_value,
1032                index
1033            )
1034        }
1035        DataType::Date32 => {
1036            generic_conversion_single_value!(
1037                Date32Type,
1038                as_primitive,
1039                |v| Date32Type::to_naive_date_opt(v).unwrap(),
1040                typed_value,
1041                index
1042            )
1043        }
1044        DataType::Time64(TimeUnit::Microsecond) => {
1045            generic_conversion_single_value_with_result!(
1046                Time64MicrosecondType,
1047                as_primitive,
1048                |v| NaiveTime::from_num_seconds_from_midnight_opt(
1049                    (v / 1_000_000) as u32,
1050                    (v % 1_000_000) as u32 * 1000
1051                )
1052                .ok_or_else(|| format!("Invalid microsecond from midnight: {}", v)),
1053                typed_value,
1054                index
1055            )
1056        }
1057        DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => {
1058            generic_conversion_single_value!(
1059                TimestampMicrosecondType,
1060                as_primitive,
1061                |v| DateTime::from_timestamp_micros(v).unwrap(),
1062                typed_value,
1063                index
1064            )
1065        }
1066        DataType::Timestamp(TimeUnit::Microsecond, None) => {
1067            generic_conversion_single_value!(
1068                TimestampMicrosecondType,
1069                as_primitive,
1070                |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(),
1071                typed_value,
1072                index
1073            )
1074        }
1075        DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => {
1076            generic_conversion_single_value!(
1077                TimestampNanosecondType,
1078                as_primitive,
1079                DateTime::from_timestamp_nanos,
1080                typed_value,
1081                index
1082            )
1083        }
1084        DataType::Timestamp(TimeUnit::Nanosecond, None) => {
1085            generic_conversion_single_value!(
1086                TimestampNanosecondType,
1087                as_primitive,
1088                |v| DateTime::from_timestamp_nanos(v).naive_utc(),
1089                typed_value,
1090                index
1091            )
1092        }
1093        // todo other types here (note this is very similar to cast_to_variant.rs)
1094        // so it would be great to figure out how to share this code
1095        _ => {
1096            // We shouldn't panic in production code, but this is a
1097            // placeholder until we implement more types
1098            // https://github.com/apache/arrow-rs/issues/8091
1099            debug_assert!(
1100                false,
1101                "Unsupported typed_value type: {}",
1102                typed_value.data_type()
1103            );
1104            Ok(Variant::Null)
1105        }
1106    }
1107}
1108
1109/// Canonicalize shredded typed_value fields (e.g. decimal narrowing) and
1110/// verify that all data types in the struct are legal for a variant array.
1111fn canonicalize_shredded_types(array: &dyn Array) -> Result<ArrayRef> {
1112    let new_type = canonicalize_and_verify_data_type(array.data_type())?;
1113    if let Cow::Borrowed(_) = new_type {
1114        if let Some(array) = array.as_struct_opt() {
1115            return Ok(Arc::new(array.clone())); // bypass the unnecessary cast
1116        }
1117    }
1118    cast(array, new_type.as_ref())
1119}
1120
1121/// Recursively visits a data type, ensuring that it only contains data types that can legally
1122/// appear in a (possibly shredded) variant array. It also narrows decimal types to the smallest
1123/// valid precision (e.g. Decimal128 -> Decimal32 when the precision fits).
1124fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_, DataType>> {
1125    use DataType::*;
1126
1127    // helper macros
1128    macro_rules! fail {
1129        () => {
1130            return Err(ArrowError::InvalidArgumentError(format!(
1131                "Illegal shredded value type: {data_type}"
1132            )))
1133        };
1134    }
1135    macro_rules! borrow {
1136        () => {
1137            Cow::Borrowed(data_type)
1138        };
1139    }
1140
1141    let new_data_type = match data_type {
1142        // Primitive arrow types that have a direct variant counterpart are allowed
1143        Null | Boolean => borrow!(),
1144        Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(),
1145
1146        // Unsigned integers and half-float are not allowed
1147        UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(),
1148
1149        // Most decimal types are allowed, with restrictions on precision and scale
1150        //
1151        // NOTE: arrow-parquet reads widens 32- and 64-bit decimals to 128-bit, but the variant spec
1152        // requires using the narrowest decimal type for a given precision. Fix those up first.
1153        Decimal64(p, s) | Decimal128(p, s)
1154            if VariantDecimal4::is_valid_precision_and_scale(p, s) =>
1155        {
1156            Cow::Owned(Decimal32(*p, *s))
1157        }
1158        Decimal128(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => {
1159            Cow::Owned(Decimal64(*p, *s))
1160        }
1161        Decimal32(p, s) if VariantDecimal4::is_valid_precision_and_scale(p, s) => borrow!(),
1162        Decimal64(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => borrow!(),
1163        Decimal128(p, s) if VariantDecimal16::is_valid_precision_and_scale(p, s) => borrow!(),
1164        Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(),
1165
1166        // Only micro and nano timestamps are allowed
1167        Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(),
1168        Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(),
1169
1170        // Only 32-bit dates and 64-bit microsecond time are allowed.
1171        Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
1172        Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
1173
1174        // Binary, string, and their view counterparts are allowed.
1175        Binary | LargeBinary | BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),
1176
1177        // UUID maps to 16-byte fixed-size binary; no other width is allowed
1178        FixedSizeBinary(16) => borrow!(),
1179        FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
1180
1181        // List-like containers and struct are allowed, maps and unions are not
1182        List(field) => match canonicalize_and_verify_field(field)? {
1183            Cow::Borrowed(_) => borrow!(),
1184            Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)),
1185        },
1186        LargeList(field) => match canonicalize_and_verify_field(field)? {
1187            Cow::Borrowed(_) => borrow!(),
1188            Cow::Owned(new_field) => Cow::Owned(DataType::LargeList(new_field)),
1189        },
1190        ListView(field) => match canonicalize_and_verify_field(field)? {
1191            Cow::Borrowed(_) => borrow!(),
1192            Cow::Owned(new_field) => Cow::Owned(DataType::ListView(new_field)),
1193        },
1194        LargeListView(field) => match canonicalize_and_verify_field(field)? {
1195            Cow::Borrowed(_) => borrow!(),
1196            Cow::Owned(new_field) => Cow::Owned(DataType::LargeListView(new_field)),
1197        },
1198        // Struct is used by the internal layout, and can also represent a shredded variant object.
1199        Struct(fields) => {
1200            // Avoid allocation unless at least one field changes, to avoid unnecessary deep cloning
1201            // of the data type. Even if some fields change, the others are shallow arc clones.
1202            let mut new_fields = std::collections::HashMap::new();
1203            for (i, field) in fields.iter().enumerate() {
1204                if let Cow::Owned(new_field) = canonicalize_and_verify_field(field)? {
1205                    new_fields.insert(i, new_field);
1206                }
1207            }
1208
1209            if new_fields.is_empty() {
1210                borrow!()
1211            } else {
1212                let new_fields = fields
1213                    .iter()
1214                    .enumerate()
1215                    .map(|(i, field)| new_fields.remove(&i).unwrap_or_else(|| field.clone()));
1216                Cow::Owned(DataType::Struct(new_fields.collect()))
1217            }
1218        }
1219        Map(..) | Union(..) => fail!(),
1220
1221        // We can _possibly_ support (some of) these some day?
1222        Dictionary(..) | RunEndEncoded(..) => fail!(),
1223    };
1224    Ok(new_data_type)
1225}
1226
1227fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, Arc<Field>>> {
1228    let new_data_type = canonicalize_and_verify_data_type(field.data_type())?;
1229
1230    // A shredded FixedSizeBinary(16) column is always a UUID. Tag it with the UUID extension type
1231    // on read, as a safety net against writers that emit the column without the extension metadata.
1232    // Canonicalization never rewrites FixedSizeBinary(16), so the type is already correct here.
1233    if matches!(new_data_type.as_ref(), DataType::FixedSizeBinary(16))
1234        && !field.has_valid_extension_type::<UuidExtension>()
1235    {
1236        let new_field = field.as_ref().clone().with_extension_type(UuidExtension);
1237        return Ok(Cow::Owned(Arc::new(new_field)));
1238    }
1239
1240    let Cow::Owned(new_data_type) = new_data_type else {
1241        return Ok(Cow::Borrowed(field));
1242    };
1243    let new_field = field.as_ref().clone().with_data_type(new_data_type);
1244    Ok(Cow::Owned(Arc::new(new_field)))
1245}
1246
1247#[cfg(test)]
1248mod test {
1249    use crate::VariantArrayBuilder;
1250    use std::str::FromStr;
1251
1252    use super::*;
1253    use arrow::array::{
1254        BinaryArray, BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array,
1255        FixedSizeBinaryArray, Int32Array, Int64Array, LargeBinaryArray, LargeListArray,
1256        LargeListViewArray, ListArray, ListViewArray, Time64MicrosecondArray,
1257    };
1258    use arrow::buffer::{OffsetBuffer, ScalarBuffer};
1259    use arrow_schema::{Field, Fields};
1260    use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, ShortString};
1261
1262    #[test]
1263    fn invalid_not_a_struct_array() {
1264        let array = make_binary_view_array();
1265        // Should fail because the input is not a StructArray
1266        let err = VariantArray::try_new(&array);
1267        assert_eq!(
1268            err.unwrap_err().to_string(),
1269            "Invalid argument error: Invalid VariantArray: requires StructArray as input"
1270        );
1271    }
1272
1273    #[test]
1274    fn invalid_missing_metadata() {
1275        let fields = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]);
1276        let array = StructArray::new(fields, vec![make_binary_view_array()], None);
1277        // Should fail because the StructArray does not contain a 'metadata' field
1278        let err = VariantArray::try_new(&array);
1279        assert_eq!(
1280            err.unwrap_err().to_string(),
1281            "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
1282        );
1283    }
1284
1285    #[test]
1286    fn all_null_missing_value_and_typed_value() {
1287        let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
1288        let array = StructArray::new(fields, vec![make_binary_view_array()], None);
1289
1290        // NOTE: By strict spec interpretation, this case (top-level variant with null/null)
1291        // should be invalid, but we currently allow it and treat it as Variant::Null.
1292        // This is a pragmatic decision to handle missing data gracefully.
1293        let variant_array = VariantArray::try_new(&array).unwrap();
1294
1295        // Verify the shredding state is AllNull
1296        assert!(matches!(
1297            variant_array.shredding_state(),
1298            ShreddingState {
1299                value: None,
1300                typed_value: None
1301            }
1302        ));
1303
1304        // Verify that value() returns Variant::Null (compensating for spec violation)
1305        for i in 0..variant_array.len() {
1306            if variant_array.is_valid(i) {
1307                assert_eq!(variant_array.value(i), parquet_variant::Variant::Null);
1308            }
1309        }
1310    }
1311
1312    #[test]
1313    fn invalid_metadata_field_type() {
1314        let fields = Fields::from(vec![
1315            Field::new("metadata", DataType::Int32, true), // not supported
1316            Field::new("value", DataType::BinaryView, true),
1317        ]);
1318        let array = StructArray::new(
1319            fields,
1320            vec![make_int32_array(), make_binary_view_array()],
1321            None,
1322        );
1323        let err = VariantArray::try_new(&array);
1324        assert_eq!(
1325            err.unwrap_err().to_string(),
1326            "Invalid argument error: VariantArray 'metadata' field must be Binary, LargeBinary, or BinaryView, got Int32"
1327        );
1328    }
1329
1330    #[test]
1331    fn invalid_value_field_type() {
1332        let fields = Fields::from(vec![
1333            Field::new("metadata", DataType::BinaryView, true),
1334            Field::new("value", DataType::Int32, true),
1335        ]);
1336        let array = StructArray::new(
1337            fields,
1338            vec![make_binary_view_array(), make_int32_array()],
1339            None,
1340        );
1341        let err = VariantArray::try_new(&array);
1342        assert_eq!(
1343            err.unwrap_err().to_string(),
1344            "Invalid argument error: VariantArray 'value' field must be Binary, LargeBinary, or BinaryView, got Int32"
1345        );
1346    }
1347
1348    fn make_binary_view_array() -> ArrayRef {
1349        Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]]))
1350    }
1351
1352    fn make_int32_array() -> ArrayRef {
1353        Arc::new(Int32Array::from(vec![1]))
1354    }
1355
1356    fn make_variant_struct_with_typed_value(typed_value: ArrayRef) -> StructArray {
1357        let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
1358            EMPTY_VARIANT_METADATA_BYTES,
1359            typed_value.len(),
1360        ));
1361        StructArrayBuilder::new()
1362            .with_field("metadata", Arc::new(metadata), false)
1363            .with_field("typed_value", typed_value, true)
1364            .build()
1365    }
1366
1367    #[test]
1368    fn try_new_tags_untagged_uuid_on_read() {
1369        // Simulate a foreign writer that shredded a UUID column as bare FixedSizeBinary(16),
1370        // omitting the UUID extension type.
1371        let typed_value = FixedSizeBinaryArray::try_from_iter(std::iter::repeat_n([0u8; 16], 2));
1372        let input = make_variant_struct_with_typed_value(Arc::new(typed_value.unwrap()));
1373
1374        // try_new canonicalizes on the read path and attaches the extension.
1375        let variant_array = VariantArray::try_new(&input).unwrap();
1376        let typed_value = variant_array.inner().field_by_name("typed_value").unwrap();
1377        assert_eq!(typed_value.data_type(), &DataType::FixedSizeBinary(16));
1378        assert!(typed_value.has_valid_extension_type::<UuidExtension>());
1379    }
1380
1381    #[test]
1382    fn try_new_tags_untagged_nested_uuid_on_read() {
1383        // A shredded object { id: { typed_value: FixedSizeBinary(16) } } whose inner UUID leaf
1384        // carries no extension type; canonicalization must reach it recursively.
1385        let leaf = FixedSizeBinaryArray::try_from_iter(std::iter::repeat_n([0u8; 16], 1)).unwrap();
1386        let inner = StructArrayBuilder::new()
1387            .with_field("typed_value", Arc::new(leaf), true)
1388            .build();
1389        let object = StructArrayBuilder::new()
1390            .with_field("id", Arc::new(inner), false)
1391            .build();
1392        let input = make_variant_struct_with_typed_value(Arc::new(object));
1393
1394        // typed_value (struct) -> id (struct) -> typed_value (the FixedSizeBinary(16) UUID leaf).
1395        let variant_array = VariantArray::try_new(&input).unwrap();
1396        let object = variant_array.typed_value_field().unwrap().as_struct();
1397        let id = object.column_by_name("id").unwrap().as_struct();
1398        let uuid_leaf = id.field_by_name("typed_value").unwrap();
1399        assert!(uuid_leaf.has_valid_extension_type::<UuidExtension>());
1400    }
1401
1402    #[test]
1403    fn all_null_shredding_state() {
1404        // Verify the shredding state is AllNull
1405        assert!(matches!(
1406            ShreddingState::new(None, None),
1407            ShreddingState {
1408                value: None,
1409                typed_value: None
1410            }
1411        ));
1412    }
1413
1414    #[test]
1415    fn all_null_variant_array_construction() {
1416        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
1417        let nulls = NullBuffer::from(vec![false, false, false]); // all null
1418
1419        let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
1420        let struct_array = StructArray::new(fields, vec![Arc::new(metadata)], Some(nulls));
1421
1422        let variant_array = VariantArray::try_new(&struct_array).unwrap();
1423
1424        // Verify the shredding state is AllNull
1425        assert!(matches!(
1426            variant_array.shredding_state(),
1427            ShreddingState {
1428                value: None,
1429                typed_value: None
1430            }
1431        ));
1432
1433        // Verify all values are null
1434        assert_eq!(variant_array.len(), 3);
1435        assert!(!variant_array.is_valid(0));
1436        assert!(!variant_array.is_valid(1));
1437        assert!(!variant_array.is_valid(2));
1438
1439        // Verify that value() returns Variant::Null for all indices
1440        for i in 0..variant_array.len() {
1441            assert!(
1442                !variant_array.is_valid(i),
1443                "Expected value at index {i} to be null"
1444            );
1445        }
1446    }
1447
1448    #[test]
1449    fn value_field_present_but_all_null_should_be_unshredded() {
1450        // This test demonstrates the issue: when a value field exists in schema
1451        // but all its values are null, it should remain Unshredded, not AllNull
1452        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
1453
1454        // Create a value field with all null values
1455        let value_nulls = NullBuffer::from(vec![false, false, false]); // all null
1456        let value_array = BinaryViewArray::from_iter_values(vec![""; 3]);
1457        let value_data = value_array
1458            .to_data()
1459            .into_builder()
1460            .nulls(Some(value_nulls))
1461            .build()
1462            .unwrap();
1463        let value = BinaryViewArray::from(value_data);
1464
1465        let fields = Fields::from(vec![
1466            Field::new("metadata", DataType::BinaryView, false),
1467            Field::new("value", DataType::BinaryView, true), // Field exists in schema
1468        ]);
1469        let struct_array = StructArray::new(
1470            fields,
1471            vec![Arc::new(metadata), Arc::new(value)],
1472            None, // struct itself is not null, just the value field is all null
1473        );
1474
1475        let variant_array = VariantArray::try_new(&struct_array).unwrap();
1476
1477        // This should be Unshredded, not AllNull, because value field exists in schema
1478        assert!(matches!(
1479            variant_array.shredding_state(),
1480            ShreddingState {
1481                value: Some(_),
1482                typed_value: None
1483            }
1484        ));
1485    }
1486
1487    #[test]
1488    fn canonicalize_and_verify_list_like_data_types() {
1489        // `parquet/tests/variant_integration.rs` validates Parquet shredded-variant fixtures that
1490        // use Parquet LIST encoding, but those fixtures do not cover Arrow-specific list container
1491        // variants (`LargeList`, `ListView`, `LargeListView`) accepted by `VariantArray::try_new`.
1492        let make_item_binary = || Arc::new(Field::new("item", DataType::Binary, true));
1493        let make_large_binary = || Arc::new(Field::new("item", DataType::LargeBinary, true));
1494        let make_item_binary_view = || Arc::new(Field::new("item", DataType::BinaryView, true));
1495
1496        let cases = vec![
1497            // Binary item
1498            DataType::LargeList(make_item_binary()),
1499            DataType::ListView(make_item_binary()),
1500            DataType::LargeListView(make_item_binary()),
1501            // Large binary item
1502            DataType::LargeList(make_large_binary()),
1503            DataType::ListView(make_large_binary()),
1504            DataType::LargeListView(make_large_binary()),
1505            // Binary view item
1506            DataType::LargeList(make_item_binary_view()),
1507            DataType::ListView(make_item_binary_view()),
1508            DataType::LargeListView(make_item_binary_view()),
1509        ];
1510
1511        for input in cases {
1512            assert_eq!(
1513                canonicalize_and_verify_data_type(&input).unwrap().as_ref(),
1514                &input
1515            );
1516        }
1517    }
1518
1519    #[test]
1520    fn variant_array_try_new_supports_list_like_typed_value() {
1521        let item_field = Arc::new(Field::new("item", DataType::Int64, true));
1522        let values: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)]));
1523
1524        let typed_values = vec![
1525            Arc::new(ListArray::new(
1526                item_field.clone(),
1527                OffsetBuffer::new(ScalarBuffer::from(vec![0, 2, 3])),
1528                values.clone(),
1529                None,
1530            )) as ArrayRef,
1531            Arc::new(LargeListArray::new(
1532                item_field.clone(),
1533                OffsetBuffer::new(ScalarBuffer::from(vec![0_i64, 2, 3])),
1534                values.clone(),
1535                None,
1536            )) as ArrayRef,
1537            Arc::new(ListViewArray::new(
1538                item_field.clone(),
1539                ScalarBuffer::from(vec![0, 2]),
1540                ScalarBuffer::from(vec![2, 1]),
1541                values.clone(),
1542                None,
1543            )) as ArrayRef,
1544            Arc::new(LargeListViewArray::new(
1545                item_field,
1546                ScalarBuffer::from(vec![0_i64, 2]),
1547                ScalarBuffer::from(vec![2_i64, 1]),
1548                values,
1549                None,
1550            )) as ArrayRef,
1551        ];
1552
1553        for typed_value in typed_values {
1554            let input = make_variant_struct_with_typed_value(typed_value.clone());
1555            let variant_array = VariantArray::try_new(&input).unwrap();
1556            assert_eq!(
1557                variant_array.typed_value_field().unwrap().data_type(),
1558                typed_value.data_type(),
1559            );
1560        }
1561    }
1562
1563    #[test]
1564    fn test_variant_array_iterable() {
1565        let mut b = VariantArrayBuilder::new(6);
1566
1567        b.append_null();
1568        b.append_variant(Variant::from(1_i8));
1569        b.append_variant(Variant::Null);
1570        b.append_variant(Variant::from(2_i32));
1571        b.append_variant(Variant::from(3_i64));
1572        b.append_null();
1573
1574        let v = b.build();
1575
1576        let variants = v.iter().collect::<Vec<_>>();
1577
1578        assert_eq!(
1579            variants,
1580            vec![
1581                None,
1582                Some(Variant::Int8(1)),
1583                Some(Variant::Null),
1584                Some(Variant::Int32(2)),
1585                Some(Variant::Int64(3)),
1586                None,
1587            ]
1588        );
1589    }
1590
1591    #[test]
1592    fn test_variant_array_iter_double_ended() {
1593        let mut b = VariantArrayBuilder::new(5);
1594
1595        b.append_variant(Variant::from(0_i32));
1596        b.append_null();
1597        b.append_variant(Variant::from(2_i32));
1598        b.append_null();
1599        b.append_variant(Variant::from(4_i32));
1600
1601        let array = b.build();
1602        let mut iter = array.iter();
1603
1604        assert_eq!(iter.next(), Some(Some(Variant::from(0_i32))));
1605        assert_eq!(iter.next(), Some(None));
1606
1607        assert_eq!(iter.next_back(), Some(Some(Variant::from(4_i32))));
1608        assert_eq!(iter.next_back(), Some(None));
1609        assert_eq!(iter.next_back(), Some(Some(Variant::from(2_i32))));
1610
1611        assert_eq!(iter.next_back(), None);
1612        assert_eq!(iter.next(), None);
1613    }
1614
1615    #[test]
1616    fn test_variant_array_iter_reverse() {
1617        let mut b = VariantArrayBuilder::new(5);
1618
1619        b.append_variant(Variant::from("a"));
1620        b.append_null();
1621        b.append_variant(Variant::from("aaa"));
1622        b.append_null();
1623        b.append_variant(Variant::from("aaaaa"));
1624
1625        let array = b.build();
1626
1627        let result: Vec<_> = array.iter().rev().collect();
1628        assert_eq!(
1629            result,
1630            vec![
1631                Some(Variant::from("aaaaa")),
1632                None,
1633                Some(Variant::from("aaa")),
1634                None,
1635                Some(Variant::from("a")),
1636            ]
1637        );
1638    }
1639
1640    #[test]
1641    fn test_variant_array_iter_empty() {
1642        let v = VariantArrayBuilder::new(0).build();
1643        let mut i = v.iter();
1644        assert!(i.next().is_none());
1645        assert!(i.next_back().is_none());
1646    }
1647
1648    #[test]
1649    fn test_from_variant_opts_into_variant_array() {
1650        let v = vec![None, Some(Variant::Null), Some(Variant::BooleanFalse), None];
1651
1652        let variant_array = VariantArray::from_iter(v);
1653
1654        assert_eq!(variant_array.len(), 4);
1655
1656        assert!(variant_array.is_null(0));
1657
1658        assert!(!variant_array.is_null(1));
1659        assert_eq!(variant_array.value(1), Variant::Null);
1660
1661        assert!(!variant_array.is_null(2));
1662        assert_eq!(variant_array.value(2), Variant::BooleanFalse);
1663
1664        assert!(variant_array.is_null(3));
1665    }
1666
1667    #[test]
1668    fn test_from_variants_into_variant_array() {
1669        let v = vec![
1670            Variant::Null,
1671            Variant::BooleanFalse,
1672            Variant::ShortString(ShortString::try_new("norm").unwrap()),
1673        ];
1674
1675        let variant_array = VariantArray::from_iter(v);
1676
1677        assert_eq!(variant_array.len(), 3);
1678
1679        assert!(!variant_array.is_null(0));
1680        assert_eq!(variant_array.value(0), Variant::Null);
1681
1682        assert!(!variant_array.is_null(1));
1683        assert_eq!(variant_array.value(1), Variant::BooleanFalse);
1684
1685        assert!(!variant_array.is_null(2));
1686        assert_eq!(
1687            variant_array.value(2),
1688            Variant::ShortString(ShortString::try_new("norm").unwrap())
1689        );
1690    }
1691
1692    #[test]
1693    fn test_variant_equality() {
1694        let v_iter = [None, Some(Variant::BooleanFalse), Some(Variant::Null), None];
1695        let v = VariantArray::from_iter(v_iter.clone());
1696
1697        {
1698            let v_copy = v.clone();
1699            assert_eq!(v, v_copy);
1700        }
1701
1702        {
1703            let v_iter_reversed = v_iter.iter().cloned().rev();
1704            let v_reversed = VariantArray::from_iter(v_iter_reversed);
1705
1706            assert_ne!(v, v_reversed);
1707        }
1708
1709        {
1710            let v_sliced = v.slice(0, 1);
1711            assert_ne!(v, v_sliced);
1712        }
1713    }
1714
1715    #[test]
1716    fn binary_typed_value_roundtrips() {
1717        // Verify that a shredded variant with Binary typed_value can be read back
1718        let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([
1719            EMPTY_VARIANT_METADATA_BYTES,
1720        ]));
1721        let typed_value: ArrayRef = Arc::new(BinaryArray::from(vec![b"hello" as &[u8]]));
1722
1723        let struct_array = StructArrayBuilder::new()
1724            .with_field("metadata", metadata, false)
1725            .with_field("typed_value", typed_value, true)
1726            .build();
1727
1728        let variant_array = VariantArray::try_new(&struct_array).unwrap();
1729        assert_eq!(variant_array.value(0), Variant::from(b"hello" as &[u8]));
1730    }
1731
1732    #[test]
1733    fn large_binary_typed_value_roundtrips() {
1734        // Verify that a shredded variant with LargeBinary typed_value can be read back
1735        let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([
1736            EMPTY_VARIANT_METADATA_BYTES,
1737        ]));
1738        let typed_value: ArrayRef = Arc::new(LargeBinaryArray::from(vec![b"world" as &[u8]]));
1739
1740        let struct_array = StructArrayBuilder::new()
1741            .with_field("metadata", metadata, false)
1742            .with_field("typed_value", typed_value, true)
1743            .build();
1744
1745        let variant_array = VariantArray::try_new(&struct_array).unwrap();
1746        assert_eq!(variant_array.value(0), Variant::from(b"world" as &[u8]));
1747    }
1748
1749    macro_rules! invalid_variant_array_test {
1750        ($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => {
1751            #[test]
1752            fn $fn_name() {
1753                let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
1754                    EMPTY_VARIANT_METADATA_BYTES,
1755                    1,
1756                ));
1757                let invalid_typed_value = $invalid_typed_value;
1758
1759                let struct_array = StructArrayBuilder::new()
1760                    .with_field("metadata", Arc::new(metadata), false)
1761                    .with_field("typed_value", Arc::new(invalid_typed_value), true)
1762                    .build();
1763
1764                let array: VariantArray = VariantArray::try_new(&struct_array)
1765                    .expect("should create variant array")
1766                    .into();
1767
1768                let result = array.try_value(0);
1769                assert!(result.is_err());
1770                let error = result.unwrap_err();
1771                assert!(matches!(error, ArrowError::CastError(_)));
1772
1773                let expected: &str = $error_msg;
1774                assert!(
1775                    error.to_string().contains($error_msg),
1776                    "error `{}` did not contain `{}`",
1777                    error,
1778                    expected
1779                )
1780            }
1781        };
1782    }
1783
1784    invalid_variant_array_test!(
1785        test_variant_array_invalide_time,
1786        Time64MicrosecondArray::from(vec![Some(86401000000)]),
1787        "Cast error: Cast failed at index 0 (array type: Time64(µs)): Invalid microsecond from midnight: 86401000000"
1788    );
1789
1790    invalid_variant_array_test!(
1791        test_variant_array_invalid_decimal32,
1792        Decimal32Array::from(vec![Some(1234567890)]),
1793        "Cast error: Cast failed at index 0 (array type: Decimal32(9, 2)): Invalid argument error: 1234567890 is wider than max precision 9"
1794    );
1795
1796    invalid_variant_array_test!(
1797        test_variant_array_invalid_decimal64,
1798        Decimal64Array::from(vec![Some(1234567890123456789)]),
1799        "Cast error: Cast failed at index 0 (array type: Decimal64(18, 6)): Invalid argument error: 1234567890123456789 is wider than max precision 18"
1800    );
1801
1802    invalid_variant_array_test!(
1803        test_variant_array_invalid_decimal128,
1804        Decimal128Array::from(vec![Some(
1805            i128::from_str("123456789012345678901234567890123456789").unwrap()
1806        ),]),
1807        "Cast error: Cast failed at index 0 (array type: Decimal128(38, 10)): Invalid argument error: 123456789012345678901234567890123456789 is wider than max precision 38"
1808    );
1809}