parquet_variant_compute/
variant_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArray`] implementation
19
20use crate::type_conversion::{generic_conversion_single_value, primitive_conversion_single_value};
21use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
22use arrow::buffer::NullBuffer;
23use arrow::compute::cast;
24use arrow::datatypes::{
25    Date32Type, Float16Type, Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, Int64Type,
26    TimestampMicrosecondType, TimestampNanosecondType,
27};
28use arrow_schema::extension::ExtensionType;
29use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
30use chrono::DateTime;
31use parquet_variant::Uuid;
32use parquet_variant::Variant;
33
34use std::borrow::Cow;
35use std::sync::Arc;
36
/// Arrow Variant [`ExtensionType`].
///
/// Represents the canonical Arrow Extension Type for storing variants.
/// See [`VariantArray`] for more examples of using this extension type.
///
/// This is a zero-sized marker type: it carries no state of its own, so it is
/// trivially copyable and can be default-constructed.
#[derive(Debug, Clone, Copy, Default)]
pub struct VariantType;
42
43impl ExtensionType for VariantType {
44    const NAME: &'static str = "arrow.parquet.variant";
45
46    // Variants extension metadata is an empty string
47    // <https://github.com/apache/arrow/blob/d803afcc43f5d132506318fd9e162d33b2c3d4cd/docs/source/format/CanonicalExtensions.rst?plain=1#L473>
48    type Metadata = &'static str;
49
50    fn metadata(&self) -> &Self::Metadata {
51        &""
52    }
53
54    fn serialize_metadata(&self) -> Option<String> {
55        Some(String::new())
56    }
57
58    fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
59        Ok("")
60    }
61
62    fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
63        if matches!(data_type, DataType::Struct(_)) {
64            Ok(())
65        } else {
66            Err(ArrowError::InvalidArgumentError(format!(
67                "VariantType only supports StructArray, got {data_type}"
68            )))
69        }
70    }
71
72    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
73        Self.supports_data_type(data_type)?;
74        Ok(Self)
75    }
76}
77
78/// An array of Parquet [`Variant`] values
79///
80/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
81/// `metadata` and `value` fields, and adds convenience methods to access
82/// the [`Variant`]s.
83///
84/// See [`VariantArrayBuilder`] for constructing `VariantArray` row by row.
85///
/// See the examples below for converting between `VariantArray` and
/// `StructArray`.
88///
89/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder
90///
91/// # Documentation
92///
/// At the time of this writing, Variant has been accepted as an official
/// extension type but has not yet been published to the [official list of
/// extension types] on the Apache Arrow website. See the [Extension Type for
/// Parquet Variant arrow] ticket for more details.
97///
98/// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908
99/// [official list of extension types]: https://arrow.apache.org/docs/format/CanonicalExtensions.html
100///
101/// # Example: Check if a [`StructArray`] has the [`VariantType`] extension
102///
103/// Arrow Arrays only provide [`DataType`], but the extension type information
104/// is stored on a [`Field`]. Thus, you must have access to the [`Schema`] or
105/// [`Field`] to check for the extension type.
106///
107/// [`Schema`]: arrow_schema::Schema
108/// ```
109/// # use arrow::array::StructArray;
110/// # use arrow_schema::{Schema, Field, DataType};
111/// # use parquet_variant::Variant;
112/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
113/// # fn get_variant_array() -> VariantArray {
114/// #   let mut builder = VariantArrayBuilder::new(10);
115/// #   builder.append_variant(Variant::from("such wow"));
116/// #   builder.build()
117/// # }
118/// # fn get_schema() -> Schema {
119/// #   Schema::new(vec![
120/// #     Field::new("id", DataType::Int32, false),
121/// #     get_variant_array().field("var"),
122/// #   ])
123/// # }
124/// let schema = get_schema();
125/// assert_eq!(schema.fields().len(), 2);
126/// // first field is not a Variant
127/// assert!(schema.field(0).try_extension_type::<VariantType>().is_err());
128/// // second field is a Variant
129/// assert!(schema.field(1).try_extension_type::<VariantType>().is_ok());
130/// ```
131///
132/// # Example: Constructing the correct [`Field`] for a [`VariantArray`]
133///
134/// You can construct the correct [`Field`] for a [`VariantArray`] using the
135/// [`VariantArray::field`] method.
136///
137/// ```
138/// # use arrow_schema::{Schema, Field, DataType};
139/// # use parquet_variant::Variant;
140/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
141/// # fn get_variant_array() -> VariantArray {
142/// #   let mut builder = VariantArrayBuilder::new(10);
143/// #   builder.append_variant(Variant::from("such wow"));
144/// #   builder.build()
145/// # }
146/// let variant_array = get_variant_array();
147/// // First field is an integer id, second field is a variant
148/// let schema = Schema::new(vec![
149///   Field::new("id", DataType::Int32, false),
150///   // call VariantArray::field to get the correct Field
151///   variant_array.field("var"),
152/// ]);
153/// ```
154///
155/// You can also construct the [`Field`] using [`VariantType`] directly
156///
157/// ```
158/// # use arrow_schema::{Schema, Field, DataType};
159/// # use parquet_variant::Variant;
160/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
161/// # fn get_variant_array() -> VariantArray {
162/// #   let mut builder = VariantArrayBuilder::new(10);
163/// #   builder.append_variant(Variant::from("such wow"));
164/// #   builder.build()
165/// # }
166/// # let variant_array = get_variant_array();
167/// // The DataType of a VariantArray varies depending on how it is shredded
168/// let data_type = variant_array.data_type().clone();
169/// // First field is an integer id, second field is a variant
170/// let schema = Schema::new(vec![
171///   Field::new("id", DataType::Int32, false),
172///   Field::new("var", data_type, false)
173///     // Add extension metadata to the field using `VariantType`
174///     .with_extension_type(VariantType),
175/// ]);
176/// ```
177///
178/// # Example: Converting a [`VariantArray`] to a [`StructArray`]
179///
180/// ```
181/// # use arrow::array::StructArray;
182/// # use parquet_variant::Variant;
183/// # use parquet_variant_compute::VariantArrayBuilder;
184/// // Create Variant Array
185/// let mut builder = VariantArrayBuilder::new(10);
186/// builder.append_variant(Variant::from("such wow"));
187/// let variant_array = builder.build();
188/// // convert to StructArray
189/// let struct_array: StructArray = variant_array.into();
190/// ```
191///
192/// # Example: Converting a [`StructArray`] to a [`VariantArray`]
193///
194/// ```
195/// # use arrow::array::StructArray;
196/// # use parquet_variant::Variant;
197/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray};
198/// # fn get_struct_array() -> StructArray {
199/// #   let mut builder = VariantArrayBuilder::new(10);
200/// #   builder.append_variant(Variant::from("such wow"));
201/// #   builder.build().into()
202/// # }
203/// let struct_array: StructArray = get_struct_array();
204/// // try and create a VariantArray from it
205/// let variant_array = VariantArray::try_new(&struct_array).unwrap();
206/// assert_eq!(variant_array.value(0), Variant::from("such wow"));
207/// ```
208///
209#[derive(Clone, Debug)]
210pub struct VariantArray {
211    /// Reference to the underlying StructArray
212    inner: StructArray,
213
214    /// The metadata column of this variant
215    metadata: BinaryViewArray,
216
217    /// how is this variant array shredded?
218    shredding_state: ShreddingState,
219}
220
221impl VariantArray {
222    /// Creates a new `VariantArray` from a [`StructArray`].
223    ///
224    /// # Arguments
225    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
226    ///
227    /// # Returns
228    /// - A new instance of `VariantArray`.
229    ///
230    /// # Errors:
231    /// - If the `StructArray` does not contain the required fields
232    ///
233    /// # Requirements of the `StructArray`
234    ///
235    /// 1. A required field named `metadata` which is binary, large_binary, or
236    ///    binary_view
237    ///
238    /// 2. An optional field named `value` that is binary, large_binary, or
239    ///    binary_view
240    ///
241    /// 3. An optional field named `typed_value` which can be any primitive type
242    ///    or be a list, large_list, list_view or struct
243    ///
244    /// NOTE: It is also permissible for the metadata field to be
245    /// Dictionary-Encoded, preferably (but not required) with an index type of
246    /// int8.
247    ///
248    /// Currently, only [`BinaryViewArray`] are supported.
249    pub fn try_new(inner: &dyn Array) -> Result<Self, ArrowError> {
250        // Workaround lack of support for Binary
251        // https://github.com/apache/arrow-rs/issues/8387
252        let inner = cast_to_binary_view_arrays(inner)?;
253
254        let Some(inner) = inner.as_struct_opt() else {
255            return Err(ArrowError::InvalidArgumentError(
256                "Invalid VariantArray: requires StructArray as input".to_string(),
257            ));
258        };
259
260        // Note the specification allows for any order so we must search by name
261
262        // Ensure the StructArray has a metadata field of BinaryView
263        let Some(metadata_field) = inner.column_by_name("metadata") else {
264            return Err(ArrowError::InvalidArgumentError(
265                "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
266            ));
267        };
268        let Some(metadata) = metadata_field.as_binary_view_opt() else {
269            return Err(ArrowError::NotYetImplemented(format!(
270                "VariantArray 'metadata' field must be BinaryView, got {}",
271                metadata_field.data_type()
272            )));
273        };
274
275        // Note these clones are cheap, they just bump the ref count
276        Ok(Self {
277            inner: inner.clone(),
278            metadata: metadata.clone(),
279            shredding_state: ShreddingState::try_from(inner)?,
280        })
281    }
282
283    pub(crate) fn from_parts(
284        metadata: BinaryViewArray,
285        value: Option<BinaryViewArray>,
286        typed_value: Option<ArrayRef>,
287        nulls: Option<NullBuffer>,
288    ) -> Self {
289        let mut builder =
290            StructArrayBuilder::new().with_field("metadata", Arc::new(metadata.clone()), false);
291        if let Some(value) = value.clone() {
292            builder = builder.with_field("value", Arc::new(value), true);
293        }
294        if let Some(typed_value) = typed_value.clone() {
295            builder = builder.with_field("typed_value", typed_value, true);
296        }
297        if let Some(nulls) = nulls {
298            builder = builder.with_nulls(nulls);
299        }
300
301        Self {
302            inner: builder.build(),
303            metadata,
304            shredding_state: ShreddingState::new(value, typed_value),
305        }
306    }
307
308    /// Returns a reference to the underlying [`StructArray`].
309    pub fn inner(&self) -> &StructArray {
310        &self.inner
311    }
312
313    /// Returns the inner [`StructArray`], consuming self
314    pub fn into_inner(self) -> StructArray {
315        self.inner
316    }
317
318    /// Return the shredding state of this `VariantArray`
319    pub fn shredding_state(&self) -> &ShreddingState {
320        &self.shredding_state
321    }
322
323    /// Return the [`Variant`] instance stored at the given row
324    ///
325    /// Note: This method does not check for nulls and the value is arbitrary
326    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
327    ///
328    /// # Panics
329    /// * if the index is out of bounds
330    /// * if the array value is null
331    ///
332    /// If this is a shredded variant but has no value at the shredded location, it
333    /// will return [`Variant::Null`].
334    ///
335    ///
336    /// # Performance Note
337    ///
338    /// This is certainly not the most efficient way to access values in a
339    /// `VariantArray`, but it is useful for testing and debugging.
340    ///
341    /// Note: Does not do deep validation of the [`Variant`], so it is up to the
342    /// caller to ensure that the metadata and value were constructed correctly.
343    pub fn value(&self, index: usize) -> Variant<'_, '_> {
344        match (self.typed_value_field(), self.value_field()) {
345            // Always prefer typed_value, if available
346            (Some(typed_value), value) if typed_value.is_valid(index) => {
347                typed_value_to_variant(typed_value, value, index)
348            }
349            // Otherwise fall back to value, if available
350            (_, Some(value)) if value.is_valid(index) => {
351                Variant::new(self.metadata.value(index), value.value(index))
352            }
353            // It is technically invalid for neither value nor typed_value fields to be available,
354            // but the spec specifically requires readers to return Variant::Null in this case.
355            _ => Variant::Null,
356        }
357    }
358
359    /// Return a reference to the metadata field of the [`StructArray`]
360    pub fn metadata_field(&self) -> &BinaryViewArray {
361        &self.metadata
362    }
363
364    /// Return a reference to the value field of the `StructArray`
365    pub fn value_field(&self) -> Option<&BinaryViewArray> {
366        self.shredding_state.value_field()
367    }
368
369    /// Return a reference to the typed_value field of the `StructArray`, if present
370    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
371        self.shredding_state.typed_value_field()
372    }
373
374    /// Return a field to represent this VariantArray in a `Schema` with
375    /// a particular name
376    pub fn field(&self, name: impl Into<String>) -> Field {
377        Field::new(
378            name.into(),
379            self.data_type().clone(),
380            self.inner.is_nullable(),
381        )
382        .with_extension_type(VariantType)
383    }
384
385    /// Returns a new DataType representing this VariantArray's inner type
386    pub fn data_type(&self) -> &DataType {
387        self.inner.data_type()
388    }
389
390    pub fn slice(&self, offset: usize, length: usize) -> Self {
391        let inner = self.inner.slice(offset, length);
392        let metadata = self.metadata.slice(offset, length);
393        let shredding_state = self.shredding_state.slice(offset, length);
394        Self {
395            inner,
396            metadata,
397            shredding_state,
398        }
399    }
400
401    pub fn len(&self) -> usize {
402        self.inner.len()
403    }
404
405    pub fn is_empty(&self) -> bool {
406        self.inner.is_empty()
407    }
408
409    pub fn nulls(&self) -> Option<&NullBuffer> {
410        self.inner.nulls()
411    }
412
413    /// Is the element at index null?
414    pub fn is_null(&self, index: usize) -> bool {
415        self.nulls().is_some_and(|n| n.is_null(index))
416    }
417
418    /// Is the element at index valid (not null)?
419    pub fn is_valid(&self, index: usize) -> bool {
420        !self.is_null(index)
421    }
422}
423
424impl From<VariantArray> for StructArray {
425    fn from(variant_array: VariantArray) -> Self {
426        variant_array.into_inner()
427    }
428}
429
430impl From<VariantArray> for ArrayRef {
431    fn from(variant_array: VariantArray) -> Self {
432        Arc::new(variant_array.into_inner())
433    }
434}
435
/// One shredded field of a partially or perfectly shredded variant. For example, suppose the
437/// shredding schema for variant `v` treats it as an object with a single field `a`, where `a` is
438/// itself a struct with the single field `b` of type INT. Then the physical layout of the column
439/// is:
440///
441/// ```text
442/// v: VARIANT {
443///     metadata: BINARY,
444///     value: BINARY,
445///     typed_value: STRUCT {
446///         a: SHREDDED_VARIANT_FIELD {
447///             value: BINARY,
448///             typed_value: STRUCT {
///                 b: SHREDDED_VARIANT_FIELD {
450///                     value: BINARY,
451///                     typed_value: INT,
452///                 },
453///             },
454///         },
455///     },
456/// }
457/// ```
458///
459/// In the above, each row of `v.value` is either a variant value (shredding failed, `v` was not an
460/// object at all) or a variant object (partial shredding, `v` was an object but included unexpected
461/// fields other than `a`), or is NULL (perfect shredding, `v` was an object containing only the
462/// single expected field `a`).
463///
464/// A similar story unfolds for each `v.typed_value.a.value` -- a variant value if shredding failed
465/// (`v:a` was not an object at all), or a variant object (`v:a` was an object with unexpected
466/// additional fields), or NULL (`v:a` was an object containing only the single expected field `b`).
467///
468/// Finally, `v.typed_value.a.typed_value.b.value` is either NULL (`v:a.b` was an integer) or else a
469/// variant value (which could be `Variant::Null`).
470#[derive(Debug)]
471pub struct ShreddedVariantFieldArray {
472    /// Reference to the underlying StructArray
473    inner: StructArray,
474    shredding_state: ShreddingState,
475}
476
477#[allow(unused)]
478impl ShreddedVariantFieldArray {
479    /// Creates a new `ShreddedVariantFieldArray` from a [`StructArray`].
480    ///
481    /// # Arguments
482    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
483    ///
484    /// # Returns
485    /// - A new instance of `ShreddedVariantFieldArray`.
486    ///
487    /// # Errors:
488    /// - If the `StructArray` does not contain the required fields
489    ///
490    /// # Requirements of the `StructArray`
491    ///
492    /// 1. An optional field named `value` that is binary, large_binary, or
493    ///    binary_view
494    ///
495    /// 2. An optional field named `typed_value` which can be any primitive type
496    ///    or be a list, large_list, list_view or struct
497    ///
498    /// Currently, only `value` columns of type [`BinaryViewArray`] are supported.
499    pub fn try_new(inner: &dyn Array) -> Result<Self, ArrowError> {
500        let Some(inner_struct) = inner.as_struct_opt() else {
501            return Err(ArrowError::InvalidArgumentError(
502                "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(),
503            ));
504        };
505
506        // Note this clone is cheap, it just bumps the ref count
507        Ok(Self {
508            inner: inner_struct.clone(),
509            shredding_state: ShreddingState::try_from(inner_struct)?,
510        })
511    }
512
513    /// Return the shredding state of this `VariantArray`
514    pub fn shredding_state(&self) -> &ShreddingState {
515        &self.shredding_state
516    }
517
518    /// Return a reference to the value field of the `StructArray`
519    pub fn value_field(&self) -> Option<&BinaryViewArray> {
520        self.shredding_state.value_field()
521    }
522
523    /// Return a reference to the typed_value field of the `StructArray`, if present
524    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
525        self.shredding_state.typed_value_field()
526    }
527
528    /// Returns a reference to the underlying [`StructArray`].
529    pub fn inner(&self) -> &StructArray {
530        &self.inner
531    }
532
533    pub(crate) fn from_parts(
534        value: Option<BinaryViewArray>,
535        typed_value: Option<ArrayRef>,
536        nulls: Option<NullBuffer>,
537    ) -> Self {
538        let mut builder = StructArrayBuilder::new();
539        if let Some(value) = value.clone() {
540            builder = builder.with_field("value", Arc::new(value), true);
541        }
542        if let Some(typed_value) = typed_value.clone() {
543            builder = builder.with_field("typed_value", typed_value, true);
544        }
545        if let Some(nulls) = nulls {
546            builder = builder.with_nulls(nulls);
547        }
548
549        Self {
550            inner: builder.build(),
551            shredding_state: ShreddingState::new(value, typed_value),
552        }
553    }
554
555    /// Returns the inner [`StructArray`], consuming self
556    pub fn into_inner(self) -> StructArray {
557        self.inner
558    }
559
560    pub fn data_type(&self) -> &DataType {
561        self.inner.data_type()
562    }
563
564    pub fn len(&self) -> usize {
565        self.inner.len()
566    }
567
568    pub fn is_empty(&self) -> bool {
569        self.inner.is_empty()
570    }
571
572    pub fn offset(&self) -> usize {
573        self.inner.offset()
574    }
575
576    pub fn nulls(&self) -> Option<&NullBuffer> {
577        // According to the shredding spec, ShreddedVariantFieldArray should be
578        // physically non-nullable - SQL NULL is inferred by both value and
579        // typed_value being physically NULL
580        None
581    }
582    /// Is the element at index null?
583    pub fn is_null(&self, index: usize) -> bool {
584        self.nulls().is_some_and(|n| n.is_null(index))
585    }
586
587    /// Is the element at index valid (not null)?
588    pub fn is_valid(&self, index: usize) -> bool {
589        !self.is_null(index)
590    }
591}
592
593impl From<ShreddedVariantFieldArray> for ArrayRef {
594    fn from(array: ShreddedVariantFieldArray) -> Self {
595        Arc::new(array.into_inner())
596    }
597}
598
599impl From<ShreddedVariantFieldArray> for StructArray {
600    fn from(array: ShreddedVariantFieldArray) -> Self {
601        array.into_inner()
602    }
603}
604
605/// Represents the shredding state of a [`VariantArray`]
606///
/// [`VariantArray`]s can be shredded according to the [Parquet Variant
/// Shredding Spec]. Shredding means that the actual value is stored in a typed
/// `typed_value` field instead of the generic `value` field.
610///
611/// Both value and typed_value are optional fields used together to encode a
612/// single value. Values in the two fields must be interpreted according to the
613/// following table (see [Parquet Variant Shredding Spec] for more details):
614///
615/// | value    | typed_value  | Meaning |
616/// |----------|--------------|---------|
617/// | NULL     | NULL         | The value is missing; only valid for shredded object fields |
618/// | non-NULL | NULL         | The value is present and may be any type, including [`Variant::Null`] |
619/// | NULL     | non-NULL     | The value is present and is the shredded type |
620/// | non-NULL | non-NULL     | The value is present and is a partially shredded object |
621///
622///
623/// Applying the above rules to entire columns, we obtain the following:
624///
625/// | value  | typed_value  | Meaning |
626/// |--------|-------------|---------|
627/// | --     | --          | **Missing**: The value is always missing; only valid for shredded object fields |
/// | exists | --          | **Unshredded**: If present, the value may be any type, including [`Variant::Null`] |
629/// | --     | exists      | **Perfectly shredded**: If present, the value is always the shredded type |
630/// | exists | exists      | **Imperfectly shredded**: The value might (not) be present and might (not) be the shredded type |
631///
632/// NOTE: Partial shredding is a row-wise situation that can arise under imperfect shredding (a
633/// column-wise situation): When both columns exist (imperfect shredding) and the typed_value column
634/// is a struct, then both columns can be non-NULL for the same row if value is a variant object
635/// (partial shredding).
636///
637/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
638#[derive(Clone, Debug)]
639pub struct ShreddingState {
640    value: Option<BinaryViewArray>,
641    typed_value: Option<ArrayRef>,
642}
643
644impl ShreddingState {
645    /// Create a new `ShreddingState` from the given `value` and `typed_value` fields
646    ///
647    /// Note you can create a `ShreddingState` from a &[`StructArray`] using
648    /// `ShreddingState::try_from(&struct_array)`, for example:
649    ///
650    /// ```no_run
651    /// # use arrow::array::StructArray;
652    /// # use parquet_variant_compute::ShreddingState;
653    /// # fn get_struct_array() -> StructArray {
654    /// #   unimplemented!()
655    /// # }
656    /// let struct_array: StructArray = get_struct_array();
657    /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
658    /// ```
659    pub fn new(value: Option<BinaryViewArray>, typed_value: Option<ArrayRef>) -> Self {
660        Self { value, typed_value }
661    }
662
663    /// Return a reference to the value field, if present
664    pub fn value_field(&self) -> Option<&BinaryViewArray> {
665        self.value.as_ref()
666    }
667
668    /// Return a reference to the typed_value field, if present
669    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
670        self.typed_value.as_ref()
671    }
672
673    /// Returns a borrowed version of this shredding state
674    pub fn borrow(&self) -> BorrowedShreddingState<'_> {
675        BorrowedShreddingState {
676            value: self.value_field(),
677            typed_value: self.typed_value_field(),
678        }
679    }
680
681    /// Slice all the underlying arrays
682    pub fn slice(&self, offset: usize, length: usize) -> Self {
683        Self {
684            value: self.value.as_ref().map(|v| v.slice(offset, length)),
685            typed_value: self.typed_value.as_ref().map(|tv| tv.slice(offset, length)),
686        }
687    }
688}
689
690/// Similar to [`ShreddingState`] except it holds borrowed references of the target arrays. Useful
691/// for avoiding clone operations when the caller does not need a self-standing shredding state.
692#[derive(Clone, Debug)]
693pub struct BorrowedShreddingState<'a> {
694    value: Option<&'a BinaryViewArray>,
695    typed_value: Option<&'a ArrayRef>,
696}
697
698impl<'a> BorrowedShreddingState<'a> {
699    /// Create a new `BorrowedShreddingState` from the given `value` and `typed_value` fields
700    ///
701    /// Note you can create a `BorrowedShreddingState` from a &[`StructArray`] using
702    /// `BorrowedShreddingState::try_from(&struct_array)`, for example:
703    ///
704    /// ```no_run
705    /// # use arrow::array::StructArray;
706    /// # use parquet_variant_compute::BorrowedShreddingState;
707    /// # fn get_struct_array() -> StructArray {
708    /// #   unimplemented!()
709    /// # }
710    /// let struct_array: StructArray = get_struct_array();
711    /// let shredding_state = BorrowedShreddingState::try_from(&struct_array).unwrap();
712    /// ```
713    pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a ArrayRef>) -> Self {
714        Self { value, typed_value }
715    }
716
717    /// Return a reference to the value field, if present
718    pub fn value_field(&self) -> Option<&'a BinaryViewArray> {
719        self.value
720    }
721
722    /// Return a reference to the typed_value field, if present
723    pub fn typed_value_field(&self) -> Option<&'a ArrayRef> {
724        self.typed_value
725    }
726}
727
728impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
729    type Error = ArrowError;
730
731    fn try_from(inner_struct: &'a StructArray) -> Result<Self, ArrowError> {
732        // The `value` column need not exist, but if it does it must be a binary view.
733        let value = if let Some(value_col) = inner_struct.column_by_name("value") {
734            let Some(binary_view) = value_col.as_binary_view_opt() else {
735                return Err(ArrowError::NotYetImplemented(format!(
736                    "VariantArray 'value' field must be BinaryView, got {}",
737                    value_col.data_type()
738                )));
739            };
740            Some(binary_view)
741        } else {
742            None
743        };
744        let typed_value = inner_struct.column_by_name("typed_value");
745        Ok(BorrowedShreddingState::new(value, typed_value))
746    }
747}
748
749impl TryFrom<&StructArray> for ShreddingState {
750    type Error = ArrowError;
751
752    fn try_from(inner_struct: &StructArray) -> Result<Self, ArrowError> {
753        Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
754    }
755}
756
757impl From<BorrowedShreddingState<'_>> for ShreddingState {
758    fn from(state: BorrowedShreddingState<'_>) -> Self {
759        ShreddingState {
760            value: state.value_field().cloned(),
761            typed_value: state.typed_value_field().cloned(),
762        }
763    }
764}
765
766/// Builds struct arrays from component fields
767///
768/// TODO: move to arrow crate
769#[derive(Debug, Default, Clone)]
770pub(crate) struct StructArrayBuilder {
771    fields: Vec<FieldRef>,
772    arrays: Vec<ArrayRef>,
773    nulls: Option<NullBuffer>,
774}
775
776impl StructArrayBuilder {
777    pub fn new() -> Self {
778        Default::default()
779    }
780
781    /// Add an array to this struct array as a field with the specified name.
782    pub fn with_field(mut self, field_name: &str, array: ArrayRef, nullable: bool) -> Self {
783        let field = Field::new(field_name, array.data_type().clone(), nullable);
784        self.fields.push(Arc::new(field));
785        self.arrays.push(array);
786        self
787    }
788
789    /// Set the null buffer for this struct array.
790    pub fn with_nulls(mut self, nulls: NullBuffer) -> Self {
791        self.nulls = Some(nulls);
792        self
793    }
794
795    pub fn build(self) -> StructArray {
796        let Self {
797            fields,
798            arrays,
799            nulls,
800        } = self;
801        StructArray::new(Fields::from(fields), arrays, nulls)
802    }
803}
804
/// Returns the non-null element at `index` of `typed_value` as a [`Variant`].
///
/// `value` is the optional `value` column that accompanies `typed_value`. It is consulted only to
/// detect the illegal situation where both columns hold a value for the same row.
///
/// # Panics
///
/// * If `typed_value` is not a struct and `value` is valid at `index` (conflicting columns).
/// * If `DateTime::from_timestamp_micros` yields no value for a microsecond timestamp (see the
///   `unwrap()` calls in the timestamp arms).
/// * In debug builds only, if the data type of `typed_value` is not handled by the `match` below.
///
/// When debug assertions are disabled, an unhandled data type produces [`Variant::Null`] instead.
fn typed_value_to_variant<'a>(
    typed_value: &'a ArrayRef,
    value: Option<&BinaryViewArray>,
    index: usize,
) -> Variant<'a, 'a> {
    let data_type = typed_value.data_type();
    if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) {
        // Only a partially shredded struct is allowed to have values for both columns
        panic!("Invalid variant, conflicting value and typed_value");
    }
    match data_type {
        DataType::Boolean => {
            let boolean_array = typed_value.as_boolean();
            let value = boolean_array.value(index);
            Variant::from(value)
        }
        DataType::Date32 => {
            let array = typed_value.as_primitive::<Date32Type>();
            let value = array.value(index);
            let date = Date32Type::to_naive_date(value);
            Variant::from(date)
        }
        // 16-byte FixedSizeBinary always corresponds to a UUID; all other sizes are illegal.
        DataType::FixedSizeBinary(16) => {
            let array = typed_value.as_fixed_size_binary();
            let value = array.value(index);
            Uuid::from_slice(value).unwrap().into() // unwrap is safe: slice is always 16 bytes
        }
        DataType::BinaryView => {
            let array = typed_value.as_binary_view();
            let value = array.value(index);
            Variant::from(value)
        }
        DataType::Utf8 => {
            let array = typed_value.as_string::<i32>();
            let value = array.value(index);
            Variant::from(value)
        }
        DataType::Int8 => {
            primitive_conversion_single_value!(Int8Type, typed_value, index)
        }
        DataType::Int16 => {
            primitive_conversion_single_value!(Int16Type, typed_value, index)
        }
        DataType::Int32 => {
            primitive_conversion_single_value!(Int32Type, typed_value, index)
        }
        DataType::Int64 => {
            primitive_conversion_single_value!(Int64Type, typed_value, index)
        }
        DataType::Float16 => {
            primitive_conversion_single_value!(Float16Type, typed_value, index)
        }
        DataType::Float32 => {
            primitive_conversion_single_value!(Float32Type, typed_value, index)
        }
        DataType::Float64 => {
            primitive_conversion_single_value!(Float64Type, typed_value, index)
        }
        // Timestamps: a time zone (`Some(_)`) keeps the `DateTime` as produced, while `None`
        // additionally converts it with `naive_utc()`.
        DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => {
            generic_conversion_single_value!(
                TimestampMicrosecondType,
                as_primitive,
                |v| DateTime::from_timestamp_micros(v).unwrap(),
                typed_value,
                index
            )
        }
        DataType::Timestamp(TimeUnit::Microsecond, None) => {
            generic_conversion_single_value!(
                TimestampMicrosecondType,
                as_primitive,
                |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(),
                typed_value,
                index
            )
        }
        DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => {
            generic_conversion_single_value!(
                TimestampNanosecondType,
                as_primitive,
                DateTime::from_timestamp_nanos,
                typed_value,
                index
            )
        }
        DataType::Timestamp(TimeUnit::Nanosecond, None) => {
            generic_conversion_single_value!(
                TimestampNanosecondType,
                as_primitive,
                |v| DateTime::from_timestamp_nanos(v).naive_utc(),
                typed_value,
                index
            )
        }
        // todo other types here (note this is very similar to cast_to_variant.rs)
        // so it would be great to figure out how to share this code
        _ => {
            // We shouldn't panic in production code, but this is a
            // placeholder until we implement more types
            // https://github.com/apache/arrow-rs/issues/8091
            debug_assert!(
                false,
                "Unsupported typed_value type: {}",
                typed_value.data_type()
            );
            Variant::Null
        }
    }
}
916
917/// Workaround for lack of direct support for BinaryArray
918/// <https://github.com/apache/arrow-rs/issues/8387>
919///
920/// The values are read as
921/// * `StructArray<metadata: Binary, value: Binary>`
922///
923/// but VariantArray needs them as
924/// * `StructArray<metadata: BinaryView, value: BinaryView>`
925///
926/// So cast them to get the right type.
927fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
928    let new_type = canonicalize_and_verify_data_type(array.data_type())?;
929    cast(array, new_type.as_ref())
930}
931
/// Checks whether an arrow decimal with precision `p` and scale `s` is a valid variant decimal.
///
/// The precision must lie in `1..=max_precision` and the scale in `0..=p`.
///
/// NOTE: By a strict reading of the "decimal table" in the [shredding spec], each decimal type
/// should have a width-dependent lower bound on precision as well as an upper bound (i.e. Decimal16
/// with precision 5 is invalid because Decimal4 "covers" it). But the variant shredding integration
/// tests specifically expect such cases to succeed, so we only enforce the upper bound here.
///
/// [shredding spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool {
    // Reject an out-of-range precision first; the scale bound below is derived from it.
    if !(1..=max_precision).contains(p) {
        return false;
    }
    let max_scale = *p as i8;
    (0..=max_scale).contains(s)
}
943
944/// Recursively visits a data type, ensuring that it only contains data types that can legally
945/// appear in a (possibly shredded) variant array. It also replaces Binary fields with BinaryView,
946/// since that's what comes back from the parquet reader and what the variant code expects to find.
947fn canonicalize_and_verify_data_type(
948    data_type: &DataType,
949) -> Result<Cow<'_, DataType>, ArrowError> {
950    use DataType::*;
951
952    // helper macros
953    macro_rules! fail {
954        () => {
955            return Err(ArrowError::InvalidArgumentError(format!(
956                "Illegal shredded value type: {data_type}"
957            )))
958        };
959    }
960    macro_rules! borrow {
961        () => {
962            Cow::Borrowed(data_type)
963        };
964    }
965
966    let new_data_type = match data_type {
967        // Primitive arrow types that have a direct variant counterpart are allowed
968        Null | Boolean => borrow!(),
969        Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(),
970
971        // Unsigned integers and half-float are not allowed
972        UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(),
973
974        // Most decimal types are allowed, with restrictions on precision and scale
975        Decimal32(p, s) if is_valid_variant_decimal(p, s, 9) => borrow!(),
976        Decimal64(p, s) if is_valid_variant_decimal(p, s, 18) => borrow!(),
977        Decimal128(p, s) if is_valid_variant_decimal(p, s, 38) => borrow!(),
978        Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(),
979
980        // Only micro and nano timestamps are allowed
981        Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(),
982        Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(),
983
984        // Only 32-bit dates and 64-bit microsecond time are allowed.
985        Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
986        Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
987
988        // Binary and string are allowed. Force Binary to BinaryView because that's what the parquet
989        // reader returns and what the rest of the variant code expects.
990        Binary => Cow::Owned(DataType::BinaryView),
991        BinaryView | Utf8 => borrow!(),
992
993        // UUID maps to 16-byte fixed-size binary; no other width is allowed
994        FixedSizeBinary(16) => borrow!(),
995        FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
996
997        // We can _possibly_ allow (some of) these some day?
998        LargeBinary | LargeUtf8 | Utf8View | ListView(_) | LargeList(_) | LargeListView(_) => {
999            fail!()
1000        }
1001
1002        // Lists and struct are allowed, maps and unions are not
1003        List(field) => match canonicalize_and_verify_field(field)? {
1004            Cow::Borrowed(_) => borrow!(),
1005            Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)),
1006        },
1007        // Struct is used by the internal layout, and can also represent a shredded variant object.
1008        Struct(fields) => {
1009            // Avoid allocation unless at least one field changes, to avoid unnecessary deep cloning
1010            // of the data type. Even if some fields change, the others are shallow arc clones.
1011            let mut new_fields = std::collections::HashMap::new();
1012            for (i, field) in fields.iter().enumerate() {
1013                if let Cow::Owned(new_field) = canonicalize_and_verify_field(field)? {
1014                    new_fields.insert(i, new_field);
1015                }
1016            }
1017
1018            if new_fields.is_empty() {
1019                borrow!()
1020            } else {
1021                let new_fields = fields
1022                    .iter()
1023                    .enumerate()
1024                    .map(|(i, field)| new_fields.remove(&i).unwrap_or_else(|| field.clone()));
1025                Cow::Owned(DataType::Struct(new_fields.collect()))
1026            }
1027        }
1028        Map(..) | Union(..) => fail!(),
1029
1030        // We can _possibly_ support (some of) these some day?
1031        Dictionary(..) | RunEndEncoded(..) => fail!(),
1032    };
1033    Ok(new_data_type)
1034}
1035
1036fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, Arc<Field>>, ArrowError> {
1037    let Cow::Owned(new_data_type) = canonicalize_and_verify_data_type(field.data_type())? else {
1038        return Ok(Cow::Borrowed(field));
1039    };
1040    let new_field = field.as_ref().clone().with_data_type(new_data_type);
1041    Ok(Cow::Owned(Arc::new(new_field)))
1042}
1043
1044#[cfg(test)]
1045mod test {
1046    use super::*;
1047    use arrow::array::{BinaryViewArray, Int32Array};
1048    use arrow_schema::{Field, Fields};
1049
1050    #[test]
1051    fn invalid_not_a_struct_array() {
1052        let array = make_binary_view_array();
1053        // Should fail because the input is not a StructArray
1054        let err = VariantArray::try_new(&array);
1055        assert_eq!(
1056            err.unwrap_err().to_string(),
1057            "Invalid argument error: Invalid VariantArray: requires StructArray as input"
1058        );
1059    }
1060
1061    #[test]
1062    fn invalid_missing_metadata() {
1063        let fields = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]);
1064        let array = StructArray::new(fields, vec![make_binary_view_array()], None);
1065        // Should fail because the StructArray does not contain a 'metadata' field
1066        let err = VariantArray::try_new(&array);
1067        assert_eq!(
1068            err.unwrap_err().to_string(),
1069            "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
1070        );
1071    }
1072
1073    #[test]
1074    fn all_null_missing_value_and_typed_value() {
1075        let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
1076        let array = StructArray::new(fields, vec![make_binary_view_array()], None);
1077
1078        // NOTE: By strict spec interpretation, this case (top-level variant with null/null)
1079        // should be invalid, but we currently allow it and treat it as Variant::Null.
1080        // This is a pragmatic decision to handle missing data gracefully.
1081        let variant_array = VariantArray::try_new(&array).unwrap();
1082
1083        // Verify the shredding state is AllNull
1084        assert!(matches!(
1085            variant_array.shredding_state(),
1086            ShreddingState {
1087                value: None,
1088                typed_value: None
1089            }
1090        ));
1091
1092        // Verify that value() returns Variant::Null (compensating for spec violation)
1093        for i in 0..variant_array.len() {
1094            if variant_array.is_valid(i) {
1095                assert_eq!(variant_array.value(i), parquet_variant::Variant::Null);
1096            }
1097        }
1098    }
1099
1100    #[test]
1101    fn invalid_metadata_field_type() {
1102        let fields = Fields::from(vec![
1103            Field::new("metadata", DataType::Int32, true), // not supported
1104            Field::new("value", DataType::BinaryView, true),
1105        ]);
1106        let array = StructArray::new(
1107            fields,
1108            vec![make_int32_array(), make_binary_view_array()],
1109            None,
1110        );
1111        let err = VariantArray::try_new(&array);
1112        assert_eq!(
1113            err.unwrap_err().to_string(),
1114            "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Int32"
1115        );
1116    }
1117
1118    #[test]
1119    fn invalid_value_field_type() {
1120        let fields = Fields::from(vec![
1121            Field::new("metadata", DataType::BinaryView, true),
1122            Field::new("value", DataType::Int32, true), // Not yet supported
1123        ]);
1124        let array = StructArray::new(
1125            fields,
1126            vec![make_binary_view_array(), make_int32_array()],
1127            None,
1128        );
1129        let err = VariantArray::try_new(&array);
1130        assert_eq!(
1131            err.unwrap_err().to_string(),
1132            "Not yet implemented: VariantArray 'value' field must be BinaryView, got Int32"
1133        );
1134    }
1135
1136    fn make_binary_view_array() -> ArrayRef {
1137        Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]]))
1138    }
1139
1140    fn make_int32_array() -> ArrayRef {
1141        Arc::new(Int32Array::from(vec![1]))
1142    }
1143
1144    #[test]
1145    fn all_null_shredding_state() {
1146        // Verify the shredding state is AllNull
1147        assert!(matches!(
1148            ShreddingState::new(None, None),
1149            ShreddingState {
1150                value: None,
1151                typed_value: None
1152            }
1153        ));
1154    }
1155
1156    #[test]
1157    fn all_null_variant_array_construction() {
1158        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
1159        let nulls = NullBuffer::from(vec![false, false, false]); // all null
1160
1161        let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
1162        let struct_array = StructArray::new(fields, vec![Arc::new(metadata)], Some(nulls));
1163
1164        let variant_array = VariantArray::try_new(&struct_array).unwrap();
1165
1166        // Verify the shredding state is AllNull
1167        assert!(matches!(
1168            variant_array.shredding_state(),
1169            ShreddingState {
1170                value: None,
1171                typed_value: None
1172            }
1173        ));
1174
1175        // Verify all values are null
1176        assert_eq!(variant_array.len(), 3);
1177        assert!(!variant_array.is_valid(0));
1178        assert!(!variant_array.is_valid(1));
1179        assert!(!variant_array.is_valid(2));
1180
1181        // Verify that value() returns Variant::Null for all indices
1182        for i in 0..variant_array.len() {
1183            assert!(
1184                !variant_array.is_valid(i),
1185                "Expected value at index {i} to be null"
1186            );
1187        }
1188    }
1189
1190    #[test]
1191    fn value_field_present_but_all_null_should_be_unshredded() {
1192        // This test demonstrates the issue: when a value field exists in schema
1193        // but all its values are null, it should remain Unshredded, not AllNull
1194        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
1195
1196        // Create a value field with all null values
1197        let value_nulls = NullBuffer::from(vec![false, false, false]); // all null
1198        let value_array = BinaryViewArray::from_iter_values(vec![""; 3]);
1199        let value_data = value_array
1200            .to_data()
1201            .into_builder()
1202            .nulls(Some(value_nulls))
1203            .build()
1204            .unwrap();
1205        let value = BinaryViewArray::from(value_data);
1206
1207        let fields = Fields::from(vec![
1208            Field::new("metadata", DataType::BinaryView, false),
1209            Field::new("value", DataType::BinaryView, true), // Field exists in schema
1210        ]);
1211        let struct_array = StructArray::new(
1212            fields,
1213            vec![Arc::new(metadata), Arc::new(value)],
1214            None, // struct itself is not null, just the value field is all null
1215        );
1216
1217        let variant_array = VariantArray::try_new(&struct_array).unwrap();
1218
1219        // This should be Unshredded, not AllNull, because value field exists in schema
1220        assert!(matches!(
1221            variant_array.shredding_state(),
1222            ShreddingState {
1223                value: Some(_),
1224                typed_value: None
1225            }
1226        ));
1227    }
1228}