Skip to main content

parquet_variant_compute/
variant_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArray`] implementation
19
20use crate::VariantArrayBuilder;
21use crate::type_conversion::{
22    generic_conversion_single_value, generic_conversion_single_value_with_result,
23    primitive_conversion_single_value,
24};
25use arrow::array::{Array, ArrayRef, AsArray, StructArray};
26use arrow::buffer::NullBuffer;
27use arrow::compute::cast;
28use arrow::datatypes::{
29    Date32Type, Decimal32Type, Decimal64Type, Decimal128Type, Float16Type, Float32Type,
30    Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, Time64MicrosecondType,
31    TimestampMicrosecondType, TimestampNanosecondType,
32};
33use arrow::error::Result;
34use arrow_schema::extension::ExtensionType;
35use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
36use chrono::{DateTime, NaiveTime};
37use parquet_variant::{
38    Uuid, Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16, VariantDecimalType as _,
39};
40
41use std::borrow::Cow;
42use std::sync::Arc;
43
44/// Returns the raw bytes at the given index from a binary-like array, return `None` if the array isn't binary-like.
45pub(crate) fn binary_array_value(array: &dyn Array, index: usize) -> Option<&[u8]> {
46    match array.data_type() {
47        DataType::Binary => Some(array.as_binary::<i32>().value(index)),
48        DataType::LargeBinary => Some(array.as_binary::<i64>().value(index)),
49        DataType::BinaryView => Some(array.as_binary_view().value(index)),
50        _ => None,
51    }
52}
53
54/// Returns a [`Variant`] from a `metadata` and `value` byte arrays, returns `None`
55/// if one of them is of invalid type.
56pub(crate) fn variant_from_arrays_at<'m, 'v>(
57    metadata: &'m dyn Array,
58    value: &'v dyn Array,
59    index: usize,
60) -> Option<Variant<'m, 'v>> {
61    let metadata = binary_array_value(metadata, index)?;
62    let value = binary_array_value(value, index)?;
63    Some(Variant::new(metadata, value))
64}
65
66/// Validates that an array has a binary-like data type.
67fn validate_binary_array(array: &dyn Array, field_name: &str) -> Result<()> {
68    match array.data_type() {
69        DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Ok(()),
70        _ => Err(ArrowError::InvalidArgumentError(format!(
71            "VariantArray '{field_name}' field must be Binary, LargeBinary, or BinaryView, got {}",
72            array.data_type()
73        ))),
74    }
75}
76
/// Arrow Variant [`ExtensionType`].
///
/// Represents the canonical Arrow Extension Type for storing variants.
/// See [`VariantArray`] for more examples of using this extension type.
///
/// This is a unit struct: it carries no state of its own.
pub struct VariantType;
82
83impl ExtensionType for VariantType {
84    const NAME: &'static str = "arrow.parquet.variant";
85
86    // Variants extension metadata is an empty string
87    // <https://github.com/apache/arrow/blob/d803afcc43f5d132506318fd9e162d33b2c3d4cd/docs/source/format/CanonicalExtensions.rst?plain=1#L473>
88    type Metadata = &'static str;
89
90    fn metadata(&self) -> &Self::Metadata {
91        &""
92    }
93
94    fn serialize_metadata(&self) -> Option<String> {
95        Some(String::new())
96    }
97
98    fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata> {
99        Ok("")
100    }
101
102    fn supports_data_type(&self, data_type: &DataType) -> Result<()> {
103        if matches!(data_type, DataType::Struct(_)) {
104            Ok(())
105        } else {
106            Err(ArrowError::InvalidArgumentError(format!(
107                "VariantType only supports StructArray, got {data_type}"
108            )))
109        }
110    }
111
112    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self> {
113        Self.supports_data_type(data_type)?;
114        Ok(Self)
115    }
116
117    fn validate(data_type: &DataType, _metadata: Self::Metadata) -> Result<()> {
118        Self.supports_data_type(data_type)
119    }
120}
121
122/// An array of Parquet [`Variant`] values
123///
124/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
125/// `metadata` and `value` fields, and adds convenience methods to access
126/// the [`Variant`]s.
127///
128/// See [`VariantArrayBuilder`] for constructing `VariantArray` row by row.
129///
/// See the examples below for converting between `VariantArray` and
/// `StructArray`.
132///
133/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder
134///
135/// # Documentation
136///
137/// Variant is documented as a canonical Arrow extension type in the
138/// [Parquet Variant] section of the [official list of extension types] on
139/// the Apache Arrow website.
140///
141/// [Parquet Variant]: https://arrow.apache.org/docs/format/CanonicalExtensions.html#parquet-variant
142/// [official list of extension types]: https://arrow.apache.org/docs/format/CanonicalExtensions.html
143///
144/// # Example: Check if a [`StructArray`] has the [`VariantType`] extension
145///
146/// Arrow Arrays only provide [`DataType`], but the extension type information
147/// is stored on a [`Field`]. Thus, you must have access to the [`Schema`] or
148/// [`Field`] to check for the extension type.
149///
150/// [`Schema`]: arrow_schema::Schema
151/// ```
152/// # use arrow::array::StructArray;
153/// # use arrow_schema::{Schema, Field, DataType};
154/// # use parquet_variant::Variant;
155/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
156/// # fn get_variant_array() -> VariantArray {
157/// #   let mut builder = VariantArrayBuilder::new(10);
158/// #   builder.append_variant(Variant::from("such wow"));
159/// #   builder.build()
160/// # }
161/// # fn get_schema() -> Schema {
162/// #   Schema::new(vec![
163/// #     Field::new("id", DataType::Int32, false),
164/// #     get_variant_array().field("var"),
165/// #   ])
166/// # }
167/// let schema = get_schema();
168/// assert_eq!(schema.fields().len(), 2);
169/// // first field is not a Variant
170/// assert!(!schema.field(0).has_valid_extension_type::<VariantType>());
171/// // second field is a Variant
172/// assert!(schema.field(1).has_valid_extension_type::<VariantType>());
173/// ```
174///
175/// # Example: Constructing the correct [`Field`] for a [`VariantArray`]
176///
177/// You can construct the correct [`Field`] for a [`VariantArray`] using the
178/// [`VariantArray::field`] method.
179///
180/// ```
181/// # use arrow_schema::{Schema, Field, DataType};
182/// # use parquet_variant::Variant;
183/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
184/// # fn get_variant_array() -> VariantArray {
185/// #   let mut builder = VariantArrayBuilder::new(10);
186/// #   builder.append_variant(Variant::from("such wow"));
187/// #   builder.build()
188/// # }
189/// let variant_array = get_variant_array();
190/// // First field is an integer id, second field is a variant
191/// let schema = Schema::new(vec![
192///   Field::new("id", DataType::Int32, false),
193///   // call VariantArray::field to get the correct Field
194///   variant_array.field("var"),
195/// ]);
196/// ```
197///
198/// You can also construct the [`Field`] using [`VariantType`] directly
199///
200/// ```
201/// # use arrow_schema::{Schema, Field, DataType};
202/// # use parquet_variant::Variant;
203/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
204/// # fn get_variant_array() -> VariantArray {
205/// #   let mut builder = VariantArrayBuilder::new(10);
206/// #   builder.append_variant(Variant::from("such wow"));
207/// #   builder.build()
208/// # }
209/// # let variant_array = get_variant_array();
210/// // The DataType of a VariantArray varies depending on how it is shredded
211/// let data_type = variant_array.data_type().clone();
212/// // First field is an integer id, second field is a variant
213/// let schema = Schema::new(vec![
214///   Field::new("id", DataType::Int32, false),
215///   Field::new("var", data_type, false)
216///     // Add extension metadata to the field using `VariantType`
217///     .with_extension_type(VariantType),
218/// ]);
219/// ```
220///
221/// # Example: Converting a [`VariantArray`] to a [`StructArray`]
222///
223/// ```
224/// # use arrow::array::StructArray;
225/// # use parquet_variant::Variant;
226/// # use parquet_variant_compute::VariantArrayBuilder;
227/// // Create Variant Array
228/// let mut builder = VariantArrayBuilder::new(10);
229/// builder.append_variant(Variant::from("such wow"));
230/// let variant_array = builder.build();
231/// // convert to StructArray
232/// let struct_array: StructArray = variant_array.into();
233/// ```
234///
235/// # Example: Converting a [`StructArray`] to a [`VariantArray`]
236///
237/// ```
238/// # use arrow::array::StructArray;
239/// # use parquet_variant::Variant;
240/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray};
241/// # fn get_struct_array() -> StructArray {
242/// #   let mut builder = VariantArrayBuilder::new(10);
243/// #   builder.append_variant(Variant::from("such wow"));
244/// #   builder.build().into()
245/// # }
246/// let struct_array: StructArray = get_struct_array();
247/// // try and create a VariantArray from it
248/// let variant_array = VariantArray::try_new(&struct_array).unwrap();
249/// assert_eq!(variant_array.value(0), Variant::from("such wow"));
250/// ```
251///
#[derive(Debug, Clone)]
pub struct VariantArray {
    /// The underlying [`StructArray`] that holds all of the variant columns
    inner: StructArray,

    /// The metadata column of this variant (Binary, LargeBinary, or BinaryView)
    metadata: ArrayRef,

    /// The optional `value` / `typed_value` columns, i.e. how this variant
    /// array is shredded
    shredding_state: ShreddingState,
}
263
264impl VariantArray {
265    /// Creates a new `VariantArray` from a [`StructArray`].
266    ///
267    /// # Arguments
268    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
269    ///
270    /// # Returns
271    /// - A new instance of `VariantArray`.
272    ///
273    /// # Errors:
274    /// - If the `StructArray` does not contain the required fields
275    ///
276    /// # Requirements of the `StructArray`
277    ///
278    /// 1. A required field named `metadata` which is binary, large_binary, or
279    ///    binary_view
280    ///
281    /// 2. An optional field named `value` that is binary, large_binary, or
282    ///    binary_view
283    ///
284    /// 3. An optional field named `typed_value` which can be any primitive type
285    ///    or be a list, large_list, list_view or struct
286    ///
287    /// NOTE: It is also permissible for the metadata field to be
288    /// Dictionary-Encoded, preferably (but not required) with an index type of
289    /// int8.
290    ///
291    pub fn try_new(inner: &dyn Array) -> Result<Self> {
292        // Canonicalize shredded typed_value fields (e.g. decimal narrowing)
293        let inner = canonicalize_shredded_types(inner)?;
294
295        let Some(inner) = inner.as_struct_opt() else {
296            return Err(ArrowError::InvalidArgumentError(
297                "Invalid VariantArray: requires StructArray as input".to_string(),
298            ));
299        };
300
301        // Note the specification allows for any order so we must search by name
302
303        // Ensure the StructArray has a metadata field that is a binary type
304        let Some(metadata_col) = inner.column_by_name("metadata") else {
305            return Err(ArrowError::InvalidArgumentError(
306                "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
307            ));
308        };
309        validate_binary_array(metadata_col.as_ref(), "metadata")?;
310
311        // Note these clones are cheap, they just bump the ref count
312        Ok(Self {
313            inner: inner.clone(),
314            metadata: metadata_col.clone(),
315            shredding_state: ShreddingState::try_from(inner)?,
316        })
317    }
318
319    pub(crate) fn from_parts(
320        metadata: ArrayRef,
321        value: Option<ArrayRef>,
322        typed_value: Option<ArrayRef>,
323        nulls: Option<NullBuffer>,
324    ) -> Self {
325        let mut builder = StructArrayBuilder::new().with_field("metadata", metadata.clone(), false);
326        if let Some(value) = value.clone() {
327            builder = builder.with_field("value", value, true);
328        }
329        if let Some(typed_value) = typed_value.clone() {
330            builder = builder.with_field("typed_value", typed_value, true);
331        }
332        if let Some(nulls) = nulls {
333            builder = builder.with_nulls(nulls);
334        }
335
336        Self {
337            inner: builder.build(),
338            metadata,
339            shredding_state: ShreddingState::new(value, typed_value),
340        }
341    }
342
343    /// Returns a reference to the underlying [`StructArray`].
344    pub fn inner(&self) -> &StructArray {
345        &self.inner
346    }
347
348    /// Returns the inner [`StructArray`], consuming self
349    pub fn into_inner(self) -> StructArray {
350        self.inner
351    }
352
353    /// Return the shredding state of this `VariantArray`
354    pub fn shredding_state(&self) -> &ShreddingState {
355        &self.shredding_state
356    }
357
358    /// Return the [`Variant`] instance stored at the given row
359    ///
360    /// This is a convenience wrapper that calls [`VariantArray::try_value`] and unwraps the `Result`.
361    /// Use `try_value` if you need to handle conversion errors gracefully.
362    ///
363    /// # Panics
364    /// * if the index is out of bounds
365    /// * if the array value is null
366    /// * if `try_value` returns an error.
367    pub fn value(&self, index: usize) -> Variant<'_, '_> {
368        self.try_value(index).unwrap()
369    }
370
371    /// Return the [`Variant`] instance stored at the given row
372    ///
373    /// Note: This method does not check for nulls and the value is arbitrary
374    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
375    ///
376    /// # Panics
377    ///
378    /// Panics if
379    /// * the index is out of bounds
380    /// * the array value is null
381    ///
382    /// # Errors
383    ///
384    /// Errors if
385    /// - the data in `typed_value` cannot be interpreted as a valid `Variant`
386    ///
387    /// If this is a shredded variant but has no value at the shredded location, it
388    /// will return [`Variant::Null`].
389    ///
390    ///
391    /// # Performance Note
392    ///
393    /// This is certainly not the most efficient way to access values in a
394    /// `VariantArray`, but it is useful for testing and debugging.
395    ///
396    /// Note: Does not do deep validation of the [`Variant`], so it is up to the
397    /// caller to ensure that the metadata and value were constructed correctly.
398    pub fn try_value(&self, index: usize) -> Result<Variant<'_, '_>> {
399        match (self.typed_value_field(), self.value_field()) {
400            // Always prefer typed_value, if available
401            (Some(typed_value), value) if typed_value.is_valid(index) => {
402                typed_value_to_variant(typed_value, value, index)
403            }
404            // Otherwise fall back to value, if available
405            (_, Some(value)) if value.is_valid(index) => variant_from_arrays_at(
406                &self.metadata,
407                value,
408                index,
409            )
410            .ok_or_else(|| {
411                ArrowError::InvalidArgumentError(format!(
412                    "metadata and value fields must be binary-like arrays, instead got {} and {}",
413                    self.metadata.data_type(),
414                    value.data_type()
415                ))
416            }),
417            // It is technically invalid for neither value nor typed_value fields to be available,
418            // but the spec specifically requires readers to return Variant::Null in this case.
419            _ => Ok(Variant::Null),
420        }
421    }
422
423    /// Return a reference to the metadata field of the [`StructArray`]
424    pub fn metadata_field(&self) -> &ArrayRef {
425        &self.metadata
426    }
427
428    /// Return a reference to the value field of the `StructArray`
429    pub fn value_field(&self) -> Option<&ArrayRef> {
430        self.shredding_state.value_field()
431    }
432
433    /// Return a reference to the typed_value field of the `StructArray`, if present
434    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
435        self.shredding_state.typed_value_field()
436    }
437
438    /// Return a field to represent this VariantArray in a `Schema` with
439    /// a particular name
440    pub fn field(&self, name: impl Into<String>) -> Field {
441        Field::new(
442            name.into(),
443            self.data_type().clone(),
444            self.inner.is_nullable(),
445        )
446        .with_extension_type(VariantType)
447    }
448
449    /// Returns a new DataType representing this VariantArray's inner type
450    pub fn data_type(&self) -> &DataType {
451        self.inner.data_type()
452    }
453
454    pub fn slice(&self, offset: usize, length: usize) -> Self {
455        let inner = self.inner.slice(offset, length);
456        let metadata = self.metadata.slice(offset, length);
457        let shredding_state = self.shredding_state.slice(offset, length);
458        Self {
459            inner,
460            metadata,
461            shredding_state,
462        }
463    }
464
465    pub fn len(&self) -> usize {
466        self.inner.len()
467    }
468
469    pub fn is_empty(&self) -> bool {
470        self.inner.is_empty()
471    }
472
473    pub fn nulls(&self) -> Option<&NullBuffer> {
474        self.inner.nulls()
475    }
476
477    /// Is the element at index null?
478    pub fn is_null(&self, index: usize) -> bool {
479        self.nulls().is_some_and(|n| n.is_null(index))
480    }
481
482    /// Is the element at index valid (not null)?
483    pub fn is_valid(&self, index: usize) -> bool {
484        !self.is_null(index)
485    }
486
487    /// Returns an iterator over the values in this array
488    pub fn iter(&self) -> VariantArrayIter<'_> {
489        VariantArrayIter::new(self)
490    }
491}
492
493impl PartialEq for VariantArray {
494    fn eq(&self, other: &Self) -> bool {
495        self.inner == other.inner
496    }
497}
498
499impl From<VariantArray> for StructArray {
500    fn from(variant_array: VariantArray) -> Self {
501        variant_array.into_inner()
502    }
503}
504
505impl From<VariantArray> for ArrayRef {
506    fn from(variant_array: VariantArray) -> Self {
507        Arc::new(variant_array.into_inner())
508    }
509}
510
511impl<'m, 'v> FromIterator<Option<Variant<'m, 'v>>> for VariantArray {
512    fn from_iter<T: IntoIterator<Item = Option<Variant<'m, 'v>>>>(iter: T) -> Self {
513        let iter = iter.into_iter();
514
515        let mut b = VariantArrayBuilder::new(iter.size_hint().0);
516        b.extend(iter);
517        b.build()
518    }
519}
520
521impl<'m, 'v> FromIterator<Variant<'m, 'v>> for VariantArray {
522    fn from_iter<T: IntoIterator<Item = Variant<'m, 'v>>>(iter: T) -> Self {
523        Self::from_iter(iter.into_iter().map(Some))
524    }
525}
526
527/// An iterator over [`VariantArray`]
528///
529/// This iterator returns `Option<Option<Variant<'a, 'a>>>` where:
530/// - `None` indicates the end of iteration
531/// - `Some(None)` indicates a null value at this position
532/// - `Some(Some(variant))` indicates a valid variant value
533///
534/// # Example
535///
536/// ```
537/// # use parquet_variant::Variant;
538/// # use parquet_variant_compute::VariantArrayBuilder;
539/// let mut builder = VariantArrayBuilder::new(10);
540/// builder.append_variant(Variant::from(42));
541/// builder.append_null();
542/// builder.append_variant(Variant::from("hello"));
543/// let array = builder.build();
544///
545/// let values = array.iter().collect::<Vec<_>>();
546/// assert_eq!(values.len(), 3);
547/// assert_eq!(values[0], Some(Variant::from(42)));
548/// assert_eq!(values[1], None);
549/// assert_eq!(values[2], Some(Variant::from("hello")));
550/// ```
#[derive(Debug)]
pub struct VariantArrayIter<'a> {
    /// The array being iterated over
    array: &'a VariantArray,
    /// Index of the next element yielded from the front
    head_i: usize,
    /// One past the index of the next element yielded from the back
    tail_i: usize,
}
557
558impl<'a> VariantArrayIter<'a> {
559    /// Creates a new iterator over the given [`VariantArray`]
560    pub fn new(array: &'a VariantArray) -> Self {
561        Self {
562            array,
563            head_i: 0,
564            tail_i: array.len(),
565        }
566    }
567
568    fn value_opt(&self, i: usize) -> Option<Variant<'a, 'a>> {
569        self.array.is_valid(i).then(|| self.array.value(i))
570    }
571}
572
573impl<'a> Iterator for VariantArrayIter<'a> {
574    type Item = Option<Variant<'a, 'a>>;
575
576    #[inline]
577    fn next(&mut self) -> Option<Self::Item> {
578        if self.head_i == self.tail_i {
579            return None;
580        }
581
582        let out = self.value_opt(self.head_i);
583
584        self.head_i += 1;
585
586        Some(out)
587    }
588
589    fn size_hint(&self) -> (usize, Option<usize>) {
590        let remainder = self.tail_i - self.head_i;
591
592        (remainder, Some(remainder))
593    }
594}
595
596impl<'a> DoubleEndedIterator for VariantArrayIter<'a> {
597    fn next_back(&mut self) -> Option<Self::Item> {
598        if self.head_i == self.tail_i {
599            return None;
600        }
601
602        self.tail_i -= 1;
603
604        Some(self.value_opt(self.tail_i))
605    }
606}
607
// `size_hint` reports identical lower and upper bounds, so the default
// `ExactSizeIterator` methods can be used as-is.
impl<'a> ExactSizeIterator for VariantArrayIter<'a> {}
609
610/// One shredded field of a partially or perfectly shredded variant. For example, suppose the
611/// shredding schema for variant `v` treats it as an object with a single field `a`, where `a` is
612/// itself a struct with the single field `b` of type INT. Then the physical layout of the column
613/// is:
614///
615/// ```text
616/// v: VARIANT {
617///     metadata: BINARY,
618///     value: BINARY,
619///     typed_value: STRUCT {
620///         a: SHREDDED_VARIANT_FIELD {
621///             value: BINARY,
622///             typed_value: STRUCT {
///                 b: SHREDDED_VARIANT_FIELD {
624///                     value: BINARY,
625///                     typed_value: INT,
626///                 },
627///             },
628///         },
629///     },
630/// }
631/// ```
632///
633/// In the above, each row of `v.value` is either a variant value (shredding failed, `v` was not an
634/// object at all) or a variant object (partial shredding, `v` was an object but included unexpected
635/// fields other than `a`), or is NULL (perfect shredding, `v` was an object containing only the
636/// single expected field `a`).
637///
638/// A similar story unfolds for each `v.typed_value.a.value` -- a variant value if shredding failed
639/// (`v:a` was not an object at all), or a variant object (`v:a` was an object with unexpected
640/// additional fields), or NULL (`v:a` was an object containing only the single expected field `b`).
641///
642/// Finally, `v.typed_value.a.typed_value.b.value` is either NULL (`v:a.b` was an integer) or else a
643/// variant value (which could be `Variant::Null`).
#[derive(Debug)]
pub struct ShreddedVariantFieldArray {
    /// The underlying [`StructArray`] holding the shredded field's columns
    inner: StructArray,
    /// The optional `value` / `typed_value` columns of this shredded field
    shredding_state: ShreddingState,
}
650
651#[allow(unused)]
652impl ShreddedVariantFieldArray {
653    /// Creates a new `ShreddedVariantFieldArray` from a [`StructArray`].
654    ///
655    /// # Arguments
656    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
657    ///
658    /// # Returns
659    /// - A new instance of `ShreddedVariantFieldArray`.
660    ///
661    /// # Errors:
662    /// - If the `StructArray` does not contain the required fields
663    ///
664    /// # Requirements of the `StructArray`
665    ///
666    /// 1. An optional field named `value` that is binary, large_binary, or
667    ///    binary_view
668    ///
669    /// 2. An optional field named `typed_value` which can be any primitive type
670    ///    or be a list, large_list, list_view or struct
671    ///
672    pub fn try_new(inner: &dyn Array) -> Result<Self> {
673        let Some(inner_struct) = inner.as_struct_opt() else {
674            return Err(ArrowError::InvalidArgumentError(
675                "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(),
676            ));
677        };
678
679        // Note this clone is cheap, it just bumps the ref count
680        Ok(Self {
681            inner: inner_struct.clone(),
682            shredding_state: ShreddingState::try_from(inner_struct)?,
683        })
684    }
685
686    /// Return the shredding state of this `VariantArray`
687    pub fn shredding_state(&self) -> &ShreddingState {
688        &self.shredding_state
689    }
690
691    /// Return a reference to the value field of the `StructArray`
692    pub fn value_field(&self) -> Option<&ArrayRef> {
693        self.shredding_state.value_field()
694    }
695
696    /// Return a reference to the typed_value field of the `StructArray`, if present
697    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
698        self.shredding_state.typed_value_field()
699    }
700
701    /// Returns a reference to the underlying [`StructArray`].
702    pub fn inner(&self) -> &StructArray {
703        &self.inner
704    }
705
706    pub(crate) fn from_parts(
707        value: Option<ArrayRef>,
708        typed_value: Option<ArrayRef>,
709        nulls: Option<NullBuffer>,
710    ) -> Self {
711        let mut builder = StructArrayBuilder::new();
712        if let Some(value) = value.clone() {
713            builder = builder.with_field("value", value, true);
714        }
715        if let Some(typed_value) = typed_value.clone() {
716            builder = builder.with_field("typed_value", typed_value, true);
717        }
718        if let Some(nulls) = nulls {
719            builder = builder.with_nulls(nulls);
720        }
721
722        Self {
723            inner: builder.build(),
724            shredding_state: ShreddingState::new(value, typed_value),
725        }
726    }
727
728    /// Returns the inner [`StructArray`], consuming self
729    pub fn into_inner(self) -> StructArray {
730        self.inner
731    }
732
733    pub fn data_type(&self) -> &DataType {
734        self.inner.data_type()
735    }
736
737    pub fn len(&self) -> usize {
738        self.inner.len()
739    }
740
741    pub fn is_empty(&self) -> bool {
742        self.inner.is_empty()
743    }
744
745    pub fn offset(&self) -> usize {
746        self.inner.offset()
747    }
748
749    pub fn nulls(&self) -> Option<&NullBuffer> {
750        // According to the shredding spec, ShreddedVariantFieldArray should be
751        // physically non-nullable - SQL NULL is inferred by both value and
752        // typed_value being physically NULL
753        None
754    }
755    /// Is the element at index null?
756    pub fn is_null(&self, index: usize) -> bool {
757        self.nulls().is_some_and(|n| n.is_null(index))
758    }
759
760    /// Is the element at index valid (not null)?
761    pub fn is_valid(&self, index: usize) -> bool {
762        !self.is_null(index)
763    }
764}
765
766impl From<ShreddedVariantFieldArray> for ArrayRef {
767    fn from(array: ShreddedVariantFieldArray) -> Self {
768        Arc::new(array.into_inner())
769    }
770}
771
772impl From<ShreddedVariantFieldArray> for StructArray {
773    fn from(array: ShreddedVariantFieldArray) -> Self {
774        array.into_inner()
775    }
776}
777
778/// Represents the shredding state of a [`VariantArray`]
779///
780/// [`VariantArray`]s can be shredded according to the [Parquet Variant
781/// Shredding Spec]. Shredding means that the actual value is stored in a typed
/// `typed_value` field instead of the generic `value` field.
783///
784/// Both value and typed_value are optional fields used together to encode a
785/// single value. Values in the two fields must be interpreted according to the
786/// following table (see [Parquet Variant Shredding Spec] for more details):
787///
788/// | value    | typed_value  | Meaning |
789/// |----------|--------------|---------|
790/// | NULL     | NULL         | The value is missing; only valid for shredded object fields |
791/// | non-NULL | NULL         | The value is present and may be any type, including [`Variant::Null`] |
792/// | NULL     | non-NULL     | The value is present and is the shredded type |
793/// | non-NULL | non-NULL     | The value is present and is a partially shredded object |
794///
795///
796/// Applying the above rules to entire columns, we obtain the following:
797///
798/// | value  | typed_value  | Meaning |
799/// |--------|-------------|---------|
800/// | --     | --          | **Missing**: The value is always missing; only valid for shredded object fields |
/// | exists | --          | **Unshredded**: If present, the value may be any type, including [`Variant::Null`] |
802/// | --     | exists      | **Perfectly shredded**: If present, the value is always the shredded type |
803/// | exists | exists      | **Imperfectly shredded**: The value might (not) be present and might (not) be the shredded type |
804///
805/// NOTE: Partial shredding is a row-wise situation that can arise under imperfect shredding (a
806/// column-wise situation): When both columns exist (imperfect shredding) and the typed_value column
807/// is a struct, then both columns can be non-NULL for the same row if value is a variant object
808/// (partial shredding).
809///
810/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
#[derive(Debug, Clone)]
pub struct ShreddingState {
    /// The `value` column, if it exists
    value: Option<ArrayRef>,
    /// The `typed_value` column, if it exists
    typed_value: Option<ArrayRef>,
}
816
817impl ShreddingState {
818    /// Create a new `ShreddingState` from the given `value` and `typed_value` fields
819    ///
820    /// Note you can create a `ShreddingState` from a &[`StructArray`] using
821    /// `ShreddingState::try_from(&struct_array)`, for example:
822    ///
823    /// ```no_run
824    /// # use arrow::array::StructArray;
825    /// # use parquet_variant_compute::ShreddingState;
826    /// # fn get_struct_array() -> StructArray {
827    /// #   unimplemented!()
828    /// # }
829    /// let struct_array: StructArray = get_struct_array();
830    /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
831    /// ```
832    pub fn new(value: Option<ArrayRef>, typed_value: Option<ArrayRef>) -> Self {
833        Self { value, typed_value }
834    }
835
836    /// Return a reference to the value field, if present
837    pub fn value_field(&self) -> Option<&ArrayRef> {
838        self.value.as_ref()
839    }
840
841    /// Return a reference to the typed_value field, if present
842    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
843        self.typed_value.as_ref()
844    }
845
846    /// Returns a borrowed version of this shredding state
847    pub fn borrow(&self) -> BorrowedShreddingState<'_> {
848        BorrowedShreddingState {
849            value: self.value_field(),
850            typed_value: self.typed_value_field(),
851        }
852    }
853
854    /// Slice all the underlying arrays
855    pub fn slice(&self, offset: usize, length: usize) -> Self {
856        Self {
857            value: self.value.as_ref().map(|v| v.slice(offset, length)),
858            typed_value: self.typed_value.as_ref().map(|tv| tv.slice(offset, length)),
859        }
860    }
861}
862
863/// Similar to [`ShreddingState`] except it holds borrowed references of the target arrays. Useful
864/// for avoiding clone operations when the caller does not need a self-standing shredding state.
865#[derive(Clone, Debug)]
866pub struct BorrowedShreddingState<'a> {
867    value: Option<&'a ArrayRef>,
868    typed_value: Option<&'a ArrayRef>,
869}
870
871impl<'a> BorrowedShreddingState<'a> {
872    /// Create a new `BorrowedShreddingState` from the given `value` and `typed_value` fields
873    ///
874    /// Note you can create a `BorrowedShreddingState` from a &[`StructArray`] using
875    /// `BorrowedShreddingState::try_from(&struct_array)`, for example:
876    ///
877    /// ```no_run
878    /// # use arrow::array::StructArray;
879    /// # use parquet_variant_compute::BorrowedShreddingState;
880    /// # fn get_struct_array() -> StructArray {
881    /// #   unimplemented!()
882    /// # }
883    /// let struct_array: StructArray = get_struct_array();
884    /// let shredding_state = BorrowedShreddingState::try_from(&struct_array).unwrap();
885    /// ```
886    pub fn new(value: Option<&'a ArrayRef>, typed_value: Option<&'a ArrayRef>) -> Self {
887        Self { value, typed_value }
888    }
889
890    /// Return a reference to the value field, if present
891    pub fn value_field(&self) -> Option<&'a ArrayRef> {
892        self.value
893    }
894
895    /// Return a reference to the typed_value field, if present
896    pub fn typed_value_field(&self) -> Option<&'a ArrayRef> {
897        self.typed_value
898    }
899}
900
901impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
902    type Error = ArrowError;
903
904    fn try_from(inner_struct: &'a StructArray) -> Result<Self> {
905        // The `value` column need not exist, but if it does it must be a binary type.
906        let value = if let Some(value_col) = inner_struct.column_by_name("value") {
907            validate_binary_array(value_col.as_ref(), "value")?;
908            Some(value_col)
909        } else {
910            None
911        };
912        let typed_value = inner_struct.column_by_name("typed_value");
913        Ok(BorrowedShreddingState::new(value, typed_value))
914    }
915}
916
917impl TryFrom<&StructArray> for ShreddingState {
918    type Error = ArrowError;
919
920    fn try_from(inner_struct: &StructArray) -> Result<Self> {
921        Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
922    }
923}
924
925impl From<BorrowedShreddingState<'_>> for ShreddingState {
926    fn from(state: BorrowedShreddingState<'_>) -> Self {
927        ShreddingState {
928            value: state.value_field().cloned(),
929            typed_value: state.typed_value_field().cloned(),
930        }
931    }
932}
933
934/// Builds struct arrays from component fields
935///
936/// TODO: move to arrow crate
937#[derive(Debug, Default, Clone)]
938pub(crate) struct StructArrayBuilder {
939    fields: Vec<FieldRef>,
940    arrays: Vec<ArrayRef>,
941    nulls: Option<NullBuffer>,
942}
943
944impl StructArrayBuilder {
945    pub fn new() -> Self {
946        Default::default()
947    }
948
949    /// Add an array to this struct array as a field with the specified name.
950    pub fn with_field(mut self, field_name: &str, array: ArrayRef, nullable: bool) -> Self {
951        let field = Field::new(field_name, array.data_type().clone(), nullable);
952        self.fields.push(Arc::new(field));
953        self.arrays.push(array);
954        self
955    }
956
957    /// Set the null buffer for this struct array.
958    pub fn with_nulls(mut self, nulls: NullBuffer) -> Self {
959        self.nulls = Some(nulls);
960        self
961    }
962
963    pub fn build(self) -> StructArray {
964        let Self {
965            fields,
966            arrays,
967            nulls,
968        } = self;
969        StructArray::new(Fields::from(fields), arrays, nulls)
970    }
971}
972
973/// returns the non-null element at index as a Variant
974fn typed_value_to_variant<'a>(
975    typed_value: &'a ArrayRef,
976    value: Option<&'a ArrayRef>,
977    index: usize,
978) -> Result<Variant<'a, 'a>> {
979    let data_type = typed_value.data_type();
980    if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) {
981        // Only a partially shredded struct is allowed to have values for both columns
982        panic!("Invalid variant, conflicting value and typed_value");
983    }
984    match data_type {
985        DataType::Null => Ok(Variant::Null),
986        DataType::Boolean => {
987            let boolean_array = typed_value.as_boolean();
988            let value = boolean_array.value(index);
989            Ok(Variant::from(value))
990        }
991        // 16-byte FixedSizeBinary alway corresponds to a UUID; all other sizes are illegal.
992        DataType::FixedSizeBinary(16) => {
993            let array = typed_value.as_fixed_size_binary();
994            let value = array.value(index);
995            Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe: slice is always 16 bytes
996        }
997        DataType::Binary => {
998            let array = typed_value.as_binary::<i32>();
999            let value = array.value(index);
1000            Ok(Variant::from(value))
1001        }
1002        DataType::LargeBinary => {
1003            let array = typed_value.as_binary::<i64>();
1004            let value = array.value(index);
1005            Ok(Variant::from(value))
1006        }
1007        DataType::BinaryView => {
1008            let array = typed_value.as_binary_view();
1009            let value = array.value(index);
1010            Ok(Variant::from(value))
1011        }
1012        DataType::Utf8 => {
1013            let array = typed_value.as_string::<i32>();
1014            let value = array.value(index);
1015            Ok(Variant::from(value))
1016        }
1017        DataType::LargeUtf8 => {
1018            let array = typed_value.as_string::<i64>();
1019            let value = array.value(index);
1020            Ok(Variant::from(value))
1021        }
1022        DataType::Utf8View => {
1023            let array = typed_value.as_string_view();
1024            let value = array.value(index);
1025            Ok(Variant::from(value))
1026        }
1027        DataType::Int8 => {
1028            primitive_conversion_single_value!(Int8Type, typed_value, index)
1029        }
1030        DataType::Int16 => {
1031            primitive_conversion_single_value!(Int16Type, typed_value, index)
1032        }
1033        DataType::Int32 => {
1034            primitive_conversion_single_value!(Int32Type, typed_value, index)
1035        }
1036        DataType::Int64 => {
1037            primitive_conversion_single_value!(Int64Type, typed_value, index)
1038        }
1039        DataType::Float16 => {
1040            primitive_conversion_single_value!(Float16Type, typed_value, index)
1041        }
1042        DataType::Float32 => {
1043            primitive_conversion_single_value!(Float32Type, typed_value, index)
1044        }
1045        DataType::Float64 => {
1046            primitive_conversion_single_value!(Float64Type, typed_value, index)
1047        }
1048        DataType::Decimal32(_, s) => {
1049            generic_conversion_single_value_with_result!(
1050                Decimal32Type,
1051                as_primitive,
1052                |v| VariantDecimal4::try_new(v, *s as u8),
1053                typed_value,
1054                index
1055            )
1056        }
1057        DataType::Decimal64(_, s) => {
1058            generic_conversion_single_value_with_result!(
1059                Decimal64Type,
1060                as_primitive,
1061                |v| VariantDecimal8::try_new(v, *s as u8),
1062                typed_value,
1063                index
1064            )
1065        }
1066        DataType::Decimal128(_, s) => {
1067            generic_conversion_single_value_with_result!(
1068                Decimal128Type,
1069                as_primitive,
1070                |v| VariantDecimal16::try_new(v, *s as u8),
1071                typed_value,
1072                index
1073            )
1074        }
1075        DataType::Date32 => {
1076            generic_conversion_single_value!(
1077                Date32Type,
1078                as_primitive,
1079                |v| Date32Type::to_naive_date_opt(v).unwrap(),
1080                typed_value,
1081                index
1082            )
1083        }
1084        DataType::Time64(TimeUnit::Microsecond) => {
1085            generic_conversion_single_value_with_result!(
1086                Time64MicrosecondType,
1087                as_primitive,
1088                |v| NaiveTime::from_num_seconds_from_midnight_opt(
1089                    (v / 1_000_000) as u32,
1090                    (v % 1_000_000) as u32 * 1000
1091                )
1092                .ok_or_else(|| format!("Invalid microsecond from midnight: {}", v)),
1093                typed_value,
1094                index
1095            )
1096        }
1097        DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => {
1098            generic_conversion_single_value!(
1099                TimestampMicrosecondType,
1100                as_primitive,
1101                |v| DateTime::from_timestamp_micros(v).unwrap(),
1102                typed_value,
1103                index
1104            )
1105        }
1106        DataType::Timestamp(TimeUnit::Microsecond, None) => {
1107            generic_conversion_single_value!(
1108                TimestampMicrosecondType,
1109                as_primitive,
1110                |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(),
1111                typed_value,
1112                index
1113            )
1114        }
1115        DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => {
1116            generic_conversion_single_value!(
1117                TimestampNanosecondType,
1118                as_primitive,
1119                DateTime::from_timestamp_nanos,
1120                typed_value,
1121                index
1122            )
1123        }
1124        DataType::Timestamp(TimeUnit::Nanosecond, None) => {
1125            generic_conversion_single_value!(
1126                TimestampNanosecondType,
1127                as_primitive,
1128                |v| DateTime::from_timestamp_nanos(v).naive_utc(),
1129                typed_value,
1130                index
1131            )
1132        }
1133        // todo other types here (note this is very similar to cast_to_variant.rs)
1134        // so it would be great to figure out how to share this code
1135        _ => {
1136            // We shouldn't panic in production code, but this is a
1137            // placeholder until we implement more types
1138            // https://github.com/apache/arrow-rs/issues/8091
1139            debug_assert!(
1140                false,
1141                "Unsupported typed_value type: {}",
1142                typed_value.data_type()
1143            );
1144            Ok(Variant::Null)
1145        }
1146    }
1147}
1148
1149/// Canonicalize shredded typed_value fields (e.g. decimal narrowing) and
1150/// verify that all data types in the struct are legal for a variant array.
1151fn canonicalize_shredded_types(array: &dyn Array) -> Result<ArrayRef> {
1152    let new_type = canonicalize_and_verify_data_type(array.data_type())?;
1153    if let Cow::Borrowed(_) = new_type {
1154        if let Some(array) = array.as_struct_opt() {
1155            return Ok(Arc::new(array.clone())); // bypass the unnecessary cast
1156        }
1157    }
1158    cast(array, new_type.as_ref())
1159}
1160
1161/// Recursively visits a data type, ensuring that it only contains data types that can legally
1162/// appear in a (possibly shredded) variant array. It also narrows decimal types to the smallest
1163/// valid precision (e.g. Decimal128 -> Decimal32 when the precision fits).
1164fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_, DataType>> {
1165    use DataType::*;
1166
1167    // helper macros
1168    macro_rules! fail {
1169        () => {
1170            return Err(ArrowError::InvalidArgumentError(format!(
1171                "Illegal shredded value type: {data_type}"
1172            )))
1173        };
1174    }
1175    macro_rules! borrow {
1176        () => {
1177            Cow::Borrowed(data_type)
1178        };
1179    }
1180
1181    let new_data_type = match data_type {
1182        // Primitive arrow types that have a direct variant counterpart are allowed
1183        Null | Boolean => borrow!(),
1184        Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(),
1185
1186        // Unsigned integers and half-float are not allowed
1187        UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(),
1188
1189        // Most decimal types are allowed, with restrictions on precision and scale
1190        //
1191        // NOTE: arrow-parquet reads widens 32- and 64-bit decimals to 128-bit, but the variant spec
1192        // requires using the narrowest decimal type for a given precision. Fix those up first.
1193        Decimal64(p, s) | Decimal128(p, s)
1194            if VariantDecimal4::is_valid_precision_and_scale(p, s) =>
1195        {
1196            Cow::Owned(Decimal32(*p, *s))
1197        }
1198        Decimal128(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => {
1199            Cow::Owned(Decimal64(*p, *s))
1200        }
1201        Decimal32(p, s) if VariantDecimal4::is_valid_precision_and_scale(p, s) => borrow!(),
1202        Decimal64(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => borrow!(),
1203        Decimal128(p, s) if VariantDecimal16::is_valid_precision_and_scale(p, s) => borrow!(),
1204        Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(),
1205
1206        // Only micro and nano timestamps are allowed
1207        Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(),
1208        Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(),
1209
1210        // Only 32-bit dates and 64-bit microsecond time are allowed.
1211        Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
1212        Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
1213
1214        // Binary, string, and their view counterparts are allowed.
1215        Binary | LargeBinary | BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),
1216
1217        // UUID maps to 16-byte fixed-size binary; no other width is allowed
1218        FixedSizeBinary(16) => borrow!(),
1219        FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
1220
1221        // List-like containers and struct are allowed, maps and unions are not
1222        List(field) => match canonicalize_and_verify_field(field)? {
1223            Cow::Borrowed(_) => borrow!(),
1224            Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)),
1225        },
1226        LargeList(field) => match canonicalize_and_verify_field(field)? {
1227            Cow::Borrowed(_) => borrow!(),
1228            Cow::Owned(new_field) => Cow::Owned(DataType::LargeList(new_field)),
1229        },
1230        ListView(field) => match canonicalize_and_verify_field(field)? {
1231            Cow::Borrowed(_) => borrow!(),
1232            Cow::Owned(new_field) => Cow::Owned(DataType::ListView(new_field)),
1233        },
1234        LargeListView(field) => match canonicalize_and_verify_field(field)? {
1235            Cow::Borrowed(_) => borrow!(),
1236            Cow::Owned(new_field) => Cow::Owned(DataType::LargeListView(new_field)),
1237        },
1238        // Struct is used by the internal layout, and can also represent a shredded variant object.
1239        Struct(fields) => {
1240            // Avoid allocation unless at least one field changes, to avoid unnecessary deep cloning
1241            // of the data type. Even if some fields change, the others are shallow arc clones.
1242            let mut new_fields = std::collections::HashMap::new();
1243            for (i, field) in fields.iter().enumerate() {
1244                if let Cow::Owned(new_field) = canonicalize_and_verify_field(field)? {
1245                    new_fields.insert(i, new_field);
1246                }
1247            }
1248
1249            if new_fields.is_empty() {
1250                borrow!()
1251            } else {
1252                let new_fields = fields
1253                    .iter()
1254                    .enumerate()
1255                    .map(|(i, field)| new_fields.remove(&i).unwrap_or_else(|| field.clone()));
1256                Cow::Owned(DataType::Struct(new_fields.collect()))
1257            }
1258        }
1259        Map(..) | Union(..) => fail!(),
1260
1261        // We can _possibly_ support (some of) these some day?
1262        Dictionary(..) | RunEndEncoded(..) => fail!(),
1263    };
1264    Ok(new_data_type)
1265}
1266
1267fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, Arc<Field>>> {
1268    let Cow::Owned(new_data_type) = canonicalize_and_verify_data_type(field.data_type())? else {
1269        return Ok(Cow::Borrowed(field));
1270    };
1271    let new_field = field.as_ref().clone().with_data_type(new_data_type);
1272    Ok(Cow::Owned(Arc::new(new_field)))
1273}
1274
1275#[cfg(test)]
1276mod test {
1277    use crate::VariantArrayBuilder;
1278    use std::str::FromStr;
1279
1280    use super::*;
1281    use arrow::array::{
1282        BinaryArray, BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array, Int32Array,
1283        Int64Array, LargeBinaryArray, LargeListArray, LargeListViewArray, ListArray, ListViewArray,
1284        Time64MicrosecondArray,
1285    };
1286    use arrow::buffer::{OffsetBuffer, ScalarBuffer};
1287    use arrow_schema::{Field, Fields};
1288    use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, ShortString};
1289
1290    #[test]
1291    fn invalid_not_a_struct_array() {
1292        let array = make_binary_view_array();
1293        // Should fail because the input is not a StructArray
1294        let err = VariantArray::try_new(&array);
1295        assert_eq!(
1296            err.unwrap_err().to_string(),
1297            "Invalid argument error: Invalid VariantArray: requires StructArray as input"
1298        );
1299    }
1300
1301    #[test]
1302    fn invalid_missing_metadata() {
1303        let fields = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]);
1304        let array = StructArray::new(fields, vec![make_binary_view_array()], None);
1305        // Should fail because the StructArray does not contain a 'metadata' field
1306        let err = VariantArray::try_new(&array);
1307        assert_eq!(
1308            err.unwrap_err().to_string(),
1309            "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
1310        );
1311    }
1312
1313    #[test]
1314    fn all_null_missing_value_and_typed_value() {
1315        let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
1316        let array = StructArray::new(fields, vec![make_binary_view_array()], None);
1317
1318        // NOTE: By strict spec interpretation, this case (top-level variant with null/null)
1319        // should be invalid, but we currently allow it and treat it as Variant::Null.
1320        // This is a pragmatic decision to handle missing data gracefully.
1321        let variant_array = VariantArray::try_new(&array).unwrap();
1322
1323        // Verify the shredding state is AllNull
1324        assert!(matches!(
1325            variant_array.shredding_state(),
1326            ShreddingState {
1327                value: None,
1328                typed_value: None
1329            }
1330        ));
1331
1332        // Verify that value() returns Variant::Null (compensating for spec violation)
1333        for i in 0..variant_array.len() {
1334            if variant_array.is_valid(i) {
1335                assert_eq!(variant_array.value(i), parquet_variant::Variant::Null);
1336            }
1337        }
1338    }
1339
1340    #[test]
1341    fn invalid_metadata_field_type() {
1342        let fields = Fields::from(vec![
1343            Field::new("metadata", DataType::Int32, true), // not supported
1344            Field::new("value", DataType::BinaryView, true),
1345        ]);
1346        let array = StructArray::new(
1347            fields,
1348            vec![make_int32_array(), make_binary_view_array()],
1349            None,
1350        );
1351        let err = VariantArray::try_new(&array);
1352        assert_eq!(
1353            err.unwrap_err().to_string(),
1354            "Invalid argument error: VariantArray 'metadata' field must be Binary, LargeBinary, or BinaryView, got Int32"
1355        );
1356    }
1357
1358    #[test]
1359    fn invalid_value_field_type() {
1360        let fields = Fields::from(vec![
1361            Field::new("metadata", DataType::BinaryView, true),
1362            Field::new("value", DataType::Int32, true),
1363        ]);
1364        let array = StructArray::new(
1365            fields,
1366            vec![make_binary_view_array(), make_int32_array()],
1367            None,
1368        );
1369        let err = VariantArray::try_new(&array);
1370        assert_eq!(
1371            err.unwrap_err().to_string(),
1372            "Invalid argument error: VariantArray 'value' field must be Binary, LargeBinary, or BinaryView, got Int32"
1373        );
1374    }
1375
1376    fn make_binary_view_array() -> ArrayRef {
1377        Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]]))
1378    }
1379
1380    fn make_int32_array() -> ArrayRef {
1381        Arc::new(Int32Array::from(vec![1]))
1382    }
1383
1384    fn make_variant_struct_with_typed_value(typed_value: ArrayRef) -> StructArray {
1385        let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
1386            EMPTY_VARIANT_METADATA_BYTES,
1387            typed_value.len(),
1388        ));
1389        StructArrayBuilder::new()
1390            .with_field("metadata", Arc::new(metadata), false)
1391            .with_field("typed_value", typed_value, true)
1392            .build()
1393    }
1394
1395    #[test]
1396    fn all_null_shredding_state() {
1397        // Verify the shredding state is AllNull
1398        assert!(matches!(
1399            ShreddingState::new(None, None),
1400            ShreddingState {
1401                value: None,
1402                typed_value: None
1403            }
1404        ));
1405    }
1406
1407    #[test]
1408    fn all_null_variant_array_construction() {
1409        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
1410        let nulls = NullBuffer::from(vec![false, false, false]); // all null
1411
1412        let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
1413        let struct_array = StructArray::new(fields, vec![Arc::new(metadata)], Some(nulls));
1414
1415        let variant_array = VariantArray::try_new(&struct_array).unwrap();
1416
1417        // Verify the shredding state is AllNull
1418        assert!(matches!(
1419            variant_array.shredding_state(),
1420            ShreddingState {
1421                value: None,
1422                typed_value: None
1423            }
1424        ));
1425
1426        // Verify all values are null
1427        assert_eq!(variant_array.len(), 3);
1428        assert!(!variant_array.is_valid(0));
1429        assert!(!variant_array.is_valid(1));
1430        assert!(!variant_array.is_valid(2));
1431
1432        // Verify that value() returns Variant::Null for all indices
1433        for i in 0..variant_array.len() {
1434            assert!(
1435                !variant_array.is_valid(i),
1436                "Expected value at index {i} to be null"
1437            );
1438        }
1439    }
1440
1441    #[test]
1442    fn value_field_present_but_all_null_should_be_unshredded() {
1443        // This test demonstrates the issue: when a value field exists in schema
1444        // but all its values are null, it should remain Unshredded, not AllNull
1445        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
1446
1447        // Create a value field with all null values
1448        let value_nulls = NullBuffer::from(vec![false, false, false]); // all null
1449        let value_array = BinaryViewArray::from_iter_values(vec![""; 3]);
1450        let value_data = value_array
1451            .to_data()
1452            .into_builder()
1453            .nulls(Some(value_nulls))
1454            .build()
1455            .unwrap();
1456        let value = BinaryViewArray::from(value_data);
1457
1458        let fields = Fields::from(vec![
1459            Field::new("metadata", DataType::BinaryView, false),
1460            Field::new("value", DataType::BinaryView, true), // Field exists in schema
1461        ]);
1462        let struct_array = StructArray::new(
1463            fields,
1464            vec![Arc::new(metadata), Arc::new(value)],
1465            None, // struct itself is not null, just the value field is all null
1466        );
1467
1468        let variant_array = VariantArray::try_new(&struct_array).unwrap();
1469
1470        // This should be Unshredded, not AllNull, because value field exists in schema
1471        assert!(matches!(
1472            variant_array.shredding_state(),
1473            ShreddingState {
1474                value: Some(_),
1475                typed_value: None
1476            }
1477        ));
1478    }
1479
1480    #[test]
1481    fn canonicalize_and_verify_list_like_data_types() {
1482        // `parquet/tests/variant_integration.rs` validates Parquet shredded-variant fixtures that
1483        // use Parquet LIST encoding, but those fixtures do not cover Arrow-specific list container
1484        // variants (`LargeList`, `ListView`, `LargeListView`) accepted by `VariantArray::try_new`.
1485        let make_item_binary = || Arc::new(Field::new("item", DataType::Binary, true));
1486        let make_large_binary = || Arc::new(Field::new("item", DataType::LargeBinary, true));
1487        let make_item_binary_view = || Arc::new(Field::new("item", DataType::BinaryView, true));
1488
1489        let cases = vec![
1490            // Binary item
1491            DataType::LargeList(make_item_binary()),
1492            DataType::ListView(make_item_binary()),
1493            DataType::LargeListView(make_item_binary()),
1494            // Large binary item
1495            DataType::LargeList(make_large_binary()),
1496            DataType::ListView(make_large_binary()),
1497            DataType::LargeListView(make_large_binary()),
1498            // Binary view item
1499            DataType::LargeList(make_item_binary_view()),
1500            DataType::ListView(make_item_binary_view()),
1501            DataType::LargeListView(make_item_binary_view()),
1502        ];
1503
1504        for input in cases {
1505            assert_eq!(
1506                canonicalize_and_verify_data_type(&input).unwrap().as_ref(),
1507                &input
1508            );
1509        }
1510    }
1511
1512    #[test]
1513    fn variant_array_try_new_supports_list_like_typed_value() {
1514        let item_field = Arc::new(Field::new("item", DataType::Int64, true));
1515        let values: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)]));
1516
1517        let typed_values = vec![
1518            Arc::new(ListArray::new(
1519                item_field.clone(),
1520                OffsetBuffer::new(ScalarBuffer::from(vec![0, 2, 3])),
1521                values.clone(),
1522                None,
1523            )) as ArrayRef,
1524            Arc::new(LargeListArray::new(
1525                item_field.clone(),
1526                OffsetBuffer::new(ScalarBuffer::from(vec![0_i64, 2, 3])),
1527                values.clone(),
1528                None,
1529            )) as ArrayRef,
1530            Arc::new(ListViewArray::new(
1531                item_field.clone(),
1532                ScalarBuffer::from(vec![0, 2]),
1533                ScalarBuffer::from(vec![2, 1]),
1534                values.clone(),
1535                None,
1536            )) as ArrayRef,
1537            Arc::new(LargeListViewArray::new(
1538                item_field,
1539                ScalarBuffer::from(vec![0_i64, 2]),
1540                ScalarBuffer::from(vec![2_i64, 1]),
1541                values,
1542                None,
1543            )) as ArrayRef,
1544        ];
1545
1546        for typed_value in typed_values {
1547            let input = make_variant_struct_with_typed_value(typed_value.clone());
1548            let variant_array = VariantArray::try_new(&input).unwrap();
1549            assert_eq!(
1550                variant_array.typed_value_field().unwrap().data_type(),
1551                typed_value.data_type(),
1552            );
1553        }
1554    }
1555
1556    #[test]
1557    fn test_variant_array_iterable() {
1558        let mut b = VariantArrayBuilder::new(6);
1559
1560        b.append_null();
1561        b.append_variant(Variant::from(1_i8));
1562        b.append_variant(Variant::Null);
1563        b.append_variant(Variant::from(2_i32));
1564        b.append_variant(Variant::from(3_i64));
1565        b.append_null();
1566
1567        let v = b.build();
1568
1569        let variants = v.iter().collect::<Vec<_>>();
1570
1571        assert_eq!(
1572            variants,
1573            vec![
1574                None,
1575                Some(Variant::Int8(1)),
1576                Some(Variant::Null),
1577                Some(Variant::Int32(2)),
1578                Some(Variant::Int64(3)),
1579                None,
1580            ]
1581        );
1582    }
1583
1584    #[test]
1585    fn test_variant_array_iter_double_ended() {
1586        let mut b = VariantArrayBuilder::new(5);
1587
1588        b.append_variant(Variant::from(0_i32));
1589        b.append_null();
1590        b.append_variant(Variant::from(2_i32));
1591        b.append_null();
1592        b.append_variant(Variant::from(4_i32));
1593
1594        let array = b.build();
1595        let mut iter = array.iter();
1596
1597        assert_eq!(iter.next(), Some(Some(Variant::from(0_i32))));
1598        assert_eq!(iter.next(), Some(None));
1599
1600        assert_eq!(iter.next_back(), Some(Some(Variant::from(4_i32))));
1601        assert_eq!(iter.next_back(), Some(None));
1602        assert_eq!(iter.next_back(), Some(Some(Variant::from(2_i32))));
1603
1604        assert_eq!(iter.next_back(), None);
1605        assert_eq!(iter.next(), None);
1606    }
1607
1608    #[test]
1609    fn test_variant_array_iter_reverse() {
1610        let mut b = VariantArrayBuilder::new(5);
1611
1612        b.append_variant(Variant::from("a"));
1613        b.append_null();
1614        b.append_variant(Variant::from("aaa"));
1615        b.append_null();
1616        b.append_variant(Variant::from("aaaaa"));
1617
1618        let array = b.build();
1619
1620        let result: Vec<_> = array.iter().rev().collect();
1621        assert_eq!(
1622            result,
1623            vec![
1624                Some(Variant::from("aaaaa")),
1625                None,
1626                Some(Variant::from("aaa")),
1627                None,
1628                Some(Variant::from("a")),
1629            ]
1630        );
1631    }
1632
1633    #[test]
1634    fn test_variant_array_iter_empty() {
1635        let v = VariantArrayBuilder::new(0).build();
1636        let mut i = v.iter();
1637        assert!(i.next().is_none());
1638        assert!(i.next_back().is_none());
1639    }
1640
1641    #[test]
1642    fn test_from_variant_opts_into_variant_array() {
1643        let v = vec![None, Some(Variant::Null), Some(Variant::BooleanFalse), None];
1644
1645        let variant_array = VariantArray::from_iter(v);
1646
1647        assert_eq!(variant_array.len(), 4);
1648
1649        assert!(variant_array.is_null(0));
1650
1651        assert!(!variant_array.is_null(1));
1652        assert_eq!(variant_array.value(1), Variant::Null);
1653
1654        assert!(!variant_array.is_null(2));
1655        assert_eq!(variant_array.value(2), Variant::BooleanFalse);
1656
1657        assert!(variant_array.is_null(3));
1658    }
1659
1660    #[test]
1661    fn test_from_variants_into_variant_array() {
1662        let v = vec![
1663            Variant::Null,
1664            Variant::BooleanFalse,
1665            Variant::ShortString(ShortString::try_new("norm").unwrap()),
1666        ];
1667
1668        let variant_array = VariantArray::from_iter(v);
1669
1670        assert_eq!(variant_array.len(), 3);
1671
1672        assert!(!variant_array.is_null(0));
1673        assert_eq!(variant_array.value(0), Variant::Null);
1674
1675        assert!(!variant_array.is_null(1));
1676        assert_eq!(variant_array.value(1), Variant::BooleanFalse);
1677
1678        assert!(!variant_array.is_null(2));
1679        assert_eq!(
1680            variant_array.value(2),
1681            Variant::ShortString(ShortString::try_new("norm").unwrap())
1682        );
1683    }
1684
1685    #[test]
1686    fn test_variant_equality() {
1687        let v_iter = [None, Some(Variant::BooleanFalse), Some(Variant::Null), None];
1688        let v = VariantArray::from_iter(v_iter.clone());
1689
1690        {
1691            let v_copy = v.clone();
1692            assert_eq!(v, v_copy);
1693        }
1694
1695        {
1696            let v_iter_reversed = v_iter.iter().cloned().rev();
1697            let v_reversed = VariantArray::from_iter(v_iter_reversed);
1698
1699            assert_ne!(v, v_reversed);
1700        }
1701
1702        {
1703            let v_sliced = v.slice(0, 1);
1704            assert_ne!(v, v_sliced);
1705        }
1706    }
1707
1708    #[test]
1709    fn binary_typed_value_roundtrips() {
1710        // Verify that a shredded variant with Binary typed_value can be read back
1711        let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([
1712            EMPTY_VARIANT_METADATA_BYTES,
1713        ]));
1714        let typed_value: ArrayRef = Arc::new(BinaryArray::from(vec![b"hello" as &[u8]]));
1715
1716        let struct_array = StructArrayBuilder::new()
1717            .with_field("metadata", metadata, false)
1718            .with_field("typed_value", typed_value, true)
1719            .build();
1720
1721        let variant_array = VariantArray::try_new(&struct_array).unwrap();
1722        assert_eq!(variant_array.value(0), Variant::from(b"hello" as &[u8]));
1723    }
1724
1725    #[test]
1726    fn large_binary_typed_value_roundtrips() {
1727        // Verify that a shredded variant with LargeBinary typed_value can be read back
1728        let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([
1729            EMPTY_VARIANT_METADATA_BYTES,
1730        ]));
1731        let typed_value: ArrayRef = Arc::new(LargeBinaryArray::from(vec![b"world" as &[u8]]));
1732
1733        let struct_array = StructArrayBuilder::new()
1734            .with_field("metadata", metadata, false)
1735            .with_field("typed_value", typed_value, true)
1736            .build();
1737
1738        let variant_array = VariantArray::try_new(&struct_array).unwrap();
1739        assert_eq!(variant_array.value(0), Variant::from(b"world" as &[u8]));
1740    }
1741
1742    macro_rules! invalid_variant_array_test {
1743        ($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => {
1744            #[test]
1745            fn $fn_name() {
1746                let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
1747                    EMPTY_VARIANT_METADATA_BYTES,
1748                    1,
1749                ));
1750                let invalid_typed_value = $invalid_typed_value;
1751
1752                let struct_array = StructArrayBuilder::new()
1753                    .with_field("metadata", Arc::new(metadata), false)
1754                    .with_field("typed_value", Arc::new(invalid_typed_value), true)
1755                    .build();
1756
1757                let array: VariantArray = VariantArray::try_new(&struct_array)
1758                    .expect("should create variant array")
1759                    .into();
1760
1761                let result = array.try_value(0);
1762                assert!(result.is_err());
1763                let error = result.unwrap_err();
1764                assert!(matches!(error, ArrowError::CastError(_)));
1765
1766                let expected: &str = $error_msg;
1767                assert!(
1768                    error.to_string().contains($error_msg),
1769                    "error `{}` did not contain `{}`",
1770                    error,
1771                    expected
1772                )
1773            }
1774        };
1775    }
1776
1777    invalid_variant_array_test!(
1778        test_variant_array_invalide_time,
1779        Time64MicrosecondArray::from(vec![Some(86401000000)]),
1780        "Cast error: Cast failed at index 0 (array type: Time64(µs)): Invalid microsecond from midnight: 86401000000"
1781    );
1782
    // 1234567890 has ten digits, one more than the precision of 9 named in the expected
    // error message, so reading the value is expected to fail.
    invalid_variant_array_test!(
        test_variant_array_invalid_decimal32,
        Decimal32Array::from(vec![Some(1234567890)]),
        "Cast error: Cast failed at index 0 (array type: Decimal32(9, 2)): Invalid argument error: 1234567890 is wider than max precision 9"
    );
1788
    // 1234567890123456789 has nineteen digits, one more than the precision of 18 named in
    // the expected error message, so reading the value is expected to fail.
    invalid_variant_array_test!(
        test_variant_array_invalid_decimal64,
        Decimal64Array::from(vec![Some(1234567890123456789)]),
        "Cast error: Cast failed at index 0 (array type: Decimal64(18, 6)): Invalid argument error: 1234567890123456789 is wider than max precision 18"
    );
1794
    // The value below has thirty-nine digits, one more than the precision of 38 named in
    // the expected error message, so reading the value is expected to fail. It is parsed
    // from a string with `i128::from_str`.
    invalid_variant_array_test!(
        test_variant_array_invalid_decimal128,
        Decimal128Array::from(vec![Some(
            i128::from_str("123456789012345678901234567890123456789").unwrap()
        ),]),
        "Cast error: Cast failed at index 0 (array type: Decimal128(38, 10)): Invalid argument error: 123456789012345678901234567890123456789 is wider than max precision 38"
    );
1802}