parquet_variant_compute/
variant_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArray`] implementation
19
20use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, StructArray};
21use arrow::buffer::NullBuffer;
22use arrow::datatypes::{Int16Type, Int32Type};
23use arrow_schema::{ArrowError, DataType};
24use parquet_variant::Variant;
25use std::any::Any;
26use std::sync::Arc;
27
28use crate::type_conversion::primitive_conversion_single_value;
29
/// An array of Parquet [`Variant`] values
///
/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
/// `metadata` and `value` fields (and, when shredded, a `typed_value` field),
/// and adds convenience methods to access the `Variant`s.
///
/// See [`VariantArrayBuilder`] for constructing a `VariantArray`.
///
/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder
///
/// # Specification
///
/// 1. This code follows the conventions for storing variants in Arrow `StructArray`
///    defined by [Extension Type for Parquet Variant arrow] and this [document].
///    At the time of this writing, this is not yet a standardized Arrow extension type.
///
/// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908
/// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing
#[derive(Debug)]
pub struct VariantArray {
    /// The underlying [`StructArray`]; length, nulls, data type and memory
    /// accounting of the `VariantArray` are all taken from this array.
    inner: StructArray,

    /// How this variant array is shredded: holds the `metadata` column plus
    /// whichever of the `value` / `typed_value` columns were found in `inner`.
    shredding_state: ShreddingState,
}
56
57impl VariantArray {
58    /// Creates a new `VariantArray` from a [`StructArray`].
59    ///
60    /// # Arguments
61    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
62    ///
63    /// # Returns
64    /// - A new instance of `VariantArray`.
65    ///
66    /// # Errors:
67    /// - If the `StructArray` does not contain the required fields
68    ///
69    /// # Requirements of the `StructArray`
70    ///
71    /// 1. A required field named `metadata` which is binary, large_binary, or
72    ///    binary_view
73    ///
74    /// 2. An optional field named `value` that is binary, large_binary, or
75    ///    binary_view
76    ///
77    /// 3. An optional field named `typed_value` which can be any primitive type
78    ///    or be a list, large_list, list_view or struct
79    ///
80    /// NOTE: It is also permissible for the metadata field to be
81    /// Dictionary-Encoded, preferably (but not required) with an index type of
82    /// int8.
83    ///
84    /// Currently, only [`BinaryViewArray`] are supported.
85    pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> {
86        let Some(inner) = inner.as_struct_opt() else {
87            return Err(ArrowError::InvalidArgumentError(
88                "Invalid VariantArray: requires StructArray as input".to_string(),
89            ));
90        };
91
92        // Note the specification allows for any order so we must search by name
93
94        // Ensure the StructArray has a metadata field of BinaryView
95        let Some(metadata_field) = inner.column_by_name("metadata") else {
96            return Err(ArrowError::InvalidArgumentError(
97                "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
98            ));
99        };
100        let Some(metadata) = metadata_field.as_binary_view_opt() else {
101            return Err(ArrowError::NotYetImplemented(format!(
102                "VariantArray 'metadata' field must be BinaryView, got {}",
103                metadata_field.data_type()
104            )));
105        };
106
107        // Find the value field, if present
108        let value = inner
109            .column_by_name("value")
110            .map(|v| {
111                v.as_binary_view_opt().ok_or_else(|| {
112                    ArrowError::NotYetImplemented(format!(
113                        "VariantArray 'value' field must be BinaryView, got {}",
114                        v.data_type()
115                    ))
116                })
117            })
118            .transpose()?;
119
120        // Find the typed_value field, if present
121        let typed_value = inner.column_by_name("typed_value");
122
123        // Note these clones are cheap, they just bump the ref count
124        let inner = inner.clone();
125        let shredding_state =
126            ShreddingState::try_new(metadata.clone(), value.cloned(), typed_value.cloned())?;
127
128        Ok(Self {
129            inner,
130            shredding_state,
131        })
132    }
133
134    /// Returns a reference to the underlying [`StructArray`].
135    pub fn inner(&self) -> &StructArray {
136        &self.inner
137    }
138
139    /// Returns the inner [`StructArray`], consuming self
140    pub fn into_inner(self) -> StructArray {
141        self.inner
142    }
143
144    /// Return the shredding state of this `VariantArray`
145    pub fn shredding_state(&self) -> &ShreddingState {
146        &self.shredding_state
147    }
148
149    /// Return the [`Variant`] instance stored at the given row
150    ///
151    /// Note: This method does not check for nulls and the value is arbitrary
152    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
153    ///
154    /// # Panics
155    /// * if the index is out of bounds
156    /// * if the array value is null
157    ///
158    /// If this is a shredded variant but has no value at the shredded location, it
159    /// will return [`Variant::Null`].
160    ///
161    ///
162    /// # Performance Note
163    ///
164    /// This is certainly not the most efficient way to access values in a
165    /// `VariantArray`, but it is useful for testing and debugging.
166    ///
167    /// Note: Does not do deep validation of the [`Variant`], so it is up to the
168    /// caller to ensure that the metadata and value were constructed correctly.
169    pub fn value(&self, index: usize) -> Variant<'_, '_> {
170        match &self.shredding_state {
171            ShreddingState::Unshredded { metadata, value } => {
172                Variant::new(metadata.value(index), value.value(index))
173            }
174            ShreddingState::Typed { typed_value, .. } => {
175                if typed_value.is_null(index) {
176                    Variant::Null
177                } else {
178                    typed_value_to_variant(typed_value, index)
179                }
180            }
181            ShreddingState::PartiallyShredded {
182                metadata,
183                value,
184                typed_value,
185            } => {
186                if typed_value.is_null(index) {
187                    Variant::new(metadata.value(index), value.value(index))
188                } else {
189                    typed_value_to_variant(typed_value, index)
190                }
191            }
192            ShreddingState::AllNull { .. } => {
193                // NOTE: This handles the case where neither value nor typed_value fields exist.
194                // For top-level variants, this returns Variant::Null (JSON null).
195                // For shredded object fields, this technically should indicate SQL NULL,
196                // but the current API cannot distinguish these contexts.
197                Variant::Null
198            }
199        }
200    }
201
202    /// Return a reference to the metadata field of the [`StructArray`]
203    pub fn metadata_field(&self) -> &BinaryViewArray {
204        self.shredding_state.metadata_field()
205    }
206
207    /// Return a reference to the value field of the `StructArray`
208    pub fn value_field(&self) -> Option<&BinaryViewArray> {
209        self.shredding_state.value_field()
210    }
211
212    /// Return a reference to the typed_value field of the `StructArray`, if present
213    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
214        self.shredding_state.typed_value_field()
215    }
216}
217
/// Represents the shredding state of a [`VariantArray`]
///
/// [`VariantArray`]s can be shredded according to the [Parquet Variant
/// Shredding Spec]. Shredding means that the actual value is stored in a typed
/// `typed_value` field instead of the generic `value` field.
///
/// Both value and typed_value are optional fields used together to encode a
/// single value. Values in the two fields must be interpreted according to the
/// following table (see [Parquet Variant Shredding Spec] for more details):
///
/// | value | typed_value | Meaning |
/// |----------|--------------|---------|
/// | null     | null         | The value is missing; only valid for shredded object fields |
/// | non-null | null         | The value is present and may be any type, including `null` |
/// | null     | non-null     | The value is present and is the shredded type |
/// | non-null | non-null     | The value is present and is a partially shredded object |
///
/// Note that the state is chosen from which columns are *present*, not from
/// the per-row nulls inside those columns.
///
/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
#[derive(Debug)]
pub enum ShreddingState {
    /// This variant has a value field and no typed_value field
    Unshredded {
        metadata: BinaryViewArray,
        value: BinaryViewArray,
    },
    /// This variant has a typed_value field and no value field
    /// meaning it is the shredded type
    Typed {
        metadata: BinaryViewArray,
        typed_value: ArrayRef,
    },
    /// Partially shredded (both a value and a typed_value field are present):
    /// * value is an object
    /// * typed_value is a shredded object.
    ///
    /// Note the spec says "Writers must not produce data where both value and
    /// typed_value are non-null, unless the Variant value is an object."
    PartiallyShredded {
        metadata: BinaryViewArray,
        value: BinaryViewArray,
        typed_value: ArrayRef,
    },
    /// All values are null, only metadata is present.
    ///
    /// This state occurs when neither `value` nor `typed_value` fields exist in the schema.
    /// Note: By strict spec interpretation, this should only be valid for shredded object fields,
    /// not top-level variants. However, we allow it and treat as Variant::Null for pragmatic
    /// handling of missing data.
    AllNull { metadata: BinaryViewArray },
}
268
269impl ShreddingState {
270    /// try to create a new `ShreddingState` from the given fields
271    pub fn try_new(
272        metadata: BinaryViewArray,
273        value: Option<BinaryViewArray>,
274        typed_value: Option<ArrayRef>,
275    ) -> Result<Self, ArrowError> {
276        match (metadata, value, typed_value) {
277            (metadata, Some(value), Some(typed_value)) => Ok(Self::PartiallyShredded {
278                metadata,
279                value,
280                typed_value,
281            }),
282            (metadata, Some(value), None) => Ok(Self::Unshredded { metadata, value }),
283            (metadata, None, Some(typed_value)) => Ok(Self::Typed {
284                metadata,
285                typed_value,
286            }),
287            (metadata, None, None) => Ok(Self::AllNull { metadata }),
288        }
289    }
290
291    /// Return a reference to the metadata field
292    pub fn metadata_field(&self) -> &BinaryViewArray {
293        match self {
294            ShreddingState::Unshredded { metadata, .. } => metadata,
295            ShreddingState::Typed { metadata, .. } => metadata,
296            ShreddingState::PartiallyShredded { metadata, .. } => metadata,
297            ShreddingState::AllNull { metadata } => metadata,
298        }
299    }
300
301    /// Return a reference to the value field, if present
302    pub fn value_field(&self) -> Option<&BinaryViewArray> {
303        match self {
304            ShreddingState::Unshredded { value, .. } => Some(value),
305            ShreddingState::Typed { .. } => None,
306            ShreddingState::PartiallyShredded { value, .. } => Some(value),
307            ShreddingState::AllNull { .. } => None,
308        }
309    }
310
311    /// Return a reference to the typed_value field, if present
312    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
313        match self {
314            ShreddingState::Unshredded { .. } => None,
315            ShreddingState::Typed { typed_value, .. } => Some(typed_value),
316            ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value),
317            ShreddingState::AllNull { .. } => None,
318        }
319    }
320
321    /// Slice all the underlying arrays
322    pub fn slice(&self, offset: usize, length: usize) -> Self {
323        match self {
324            ShreddingState::Unshredded { metadata, value } => ShreddingState::Unshredded {
325                metadata: metadata.slice(offset, length),
326                value: value.slice(offset, length),
327            },
328            ShreddingState::Typed {
329                metadata,
330                typed_value,
331            } => ShreddingState::Typed {
332                metadata: metadata.slice(offset, length),
333                typed_value: typed_value.slice(offset, length),
334            },
335            ShreddingState::PartiallyShredded {
336                metadata,
337                value,
338                typed_value,
339            } => ShreddingState::PartiallyShredded {
340                metadata: metadata.slice(offset, length),
341                value: value.slice(offset, length),
342                typed_value: typed_value.slice(offset, length),
343            },
344            ShreddingState::AllNull { metadata } => ShreddingState::AllNull {
345                metadata: metadata.slice(offset, length),
346            },
347        }
348    }
349}
350
351/// returns the non-null element at index as a Variant
352fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '_> {
353    match typed_value.data_type() {
354        DataType::Int32 => {
355            primitive_conversion_single_value!(Int32Type, typed_value, index)
356        }
357        DataType::Int16 => {
358            primitive_conversion_single_value!(Int16Type, typed_value, index)
359        }
360        // todo other types here (note this is very similar to cast_to_variant.rs)
361        // so it would be great to figure out how to share this code
362        _ => {
363            // We shouldn't panic in production code, but this is a
364            // placeholder until we implement more types
365            // https://github.com/apache/arrow-rs/issues/8091
366            debug_assert!(
367                false,
368                "Unsupported typed_value type: {:?}",
369                typed_value.data_type()
370            );
371            Variant::Null
372        }
373    }
374}
375
/// [`Array`] implementation for [`VariantArray`].
///
/// Every method except `as_any` and `slice` simply forwards to the wrapped
/// [`StructArray`].
impl Array for VariantArray {
    /// Returns `self` (the `VariantArray`, not the inner `StructArray`) for downcasting.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Returns the [`ArrayData`] of the inner `StructArray`.
    fn to_data(&self) -> ArrayData {
        self.inner.to_data()
    }

    /// Consumes `self` and returns the [`ArrayData`] of the inner `StructArray`.
    fn into_data(self) -> ArrayData {
        self.inner.into_data()
    }

    /// Returns the data type of the inner `StructArray`.
    fn data_type(&self) -> &DataType {
        self.inner.data_type()
    }

    /// Returns a new `VariantArray` covering `length` rows starting at `offset`.
    ///
    /// Both the inner `StructArray` and the shredding state are sliced with
    /// the same arguments so that they keep describing the same rows.
    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
        let inner = self.inner.slice(offset, length);
        let shredding_state = self.shredding_state.slice(offset, length);
        Arc::new(Self {
            inner,
            shredding_state,
        })
    }

    /// Returns the number of rows of the inner `StructArray`.
    fn len(&self) -> usize {
        self.inner.len()
    }

    /// Returns whether the inner `StructArray` is empty.
    fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Returns the offset of the inner `StructArray`.
    fn offset(&self) -> usize {
        self.inner.offset()
    }

    /// Returns the null buffer of the inner `StructArray`, if any.
    fn nulls(&self) -> Option<&NullBuffer> {
        self.inner.nulls()
    }

    /// Returns the buffer memory size reported by the inner `StructArray`.
    fn get_buffer_memory_size(&self) -> usize {
        self.inner.get_buffer_memory_size()
    }

    /// Returns the array memory size reported by the inner `StructArray`.
    fn get_array_memory_size(&self) -> usize {
        self.inner.get_array_memory_size()
    }
}
426
#[cfg(test)]
mod test {
    use super::*;
    use arrow::array::{BinaryArray, BinaryViewArray};
    use arrow_schema::{Field, Fields};

    #[test]
    fn invalid_not_a_struct_array() {
        let array = make_binary_view_array();
        // Should fail because the input is not a StructArray
        let err = VariantArray::try_new(array);
        assert_eq!(
            err.unwrap_err().to_string(),
            "Invalid argument error: Invalid VariantArray: requires StructArray as input"
        );
    }

    #[test]
    fn invalid_missing_metadata() {
        let fields = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]);
        let array = StructArray::new(fields, vec![make_binary_view_array()], None);
        // Should fail because the StructArray does not contain a 'metadata' field
        let err = VariantArray::try_new(Arc::new(array));
        assert_eq!(
            err.unwrap_err().to_string(),
            "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
        );
    }

    #[test]
    fn all_null_missing_value_and_typed_value() {
        let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
        let array = StructArray::new(fields, vec![make_binary_view_array()], None);

        // NOTE: By strict spec interpretation, this case (top-level variant with null/null)
        // should be invalid, but we currently allow it and treat it as Variant::Null.
        // This is a pragmatic decision to handle missing data gracefully.
        let variant_array = VariantArray::try_new(Arc::new(array)).unwrap();

        // Verify the shredding state is AllNull
        assert!(matches!(
            variant_array.shredding_state(),
            ShreddingState::AllNull { .. }
        ));

        // Verify that value() returns Variant::Null (compensating for spec violation)
        for i in 0..variant_array.len() {
            if variant_array.is_valid(i) {
                assert_eq!(variant_array.value(i), parquet_variant::Variant::Null);
            }
        }
    }

    #[test]
    fn invalid_metadata_field_type() {
        let fields = Fields::from(vec![
            Field::new("metadata", DataType::Binary, true), // Not yet supported
            Field::new("value", DataType::BinaryView, true),
        ]);
        let array = StructArray::new(
            fields,
            vec![make_binary_array(), make_binary_view_array()],
            None,
        );
        let err = VariantArray::try_new(Arc::new(array));
        assert_eq!(
            err.unwrap_err().to_string(),
            "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Binary"
        );
    }

    #[test]
    fn invalid_value_field_type() {
        let fields = Fields::from(vec![
            Field::new("metadata", DataType::BinaryView, true),
            Field::new("value", DataType::Binary, true), // Not yet supported
        ]);
        let array = StructArray::new(
            fields,
            vec![make_binary_view_array(), make_binary_array()],
            None,
        );
        let err = VariantArray::try_new(Arc::new(array));
        assert_eq!(
            err.unwrap_err().to_string(),
            "Not yet implemented: VariantArray 'value' field must be BinaryView, got Binary"
        );
    }

    /// Single-row BinaryView column used as a supported `metadata`/`value` field.
    fn make_binary_view_array() -> ArrayRef {
        Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]]))
    }

    /// Single-row Binary column used to exercise the "not yet supported" errors.
    fn make_binary_array() -> ArrayRef {
        Arc::new(BinaryArray::from(vec![b"test" as &[u8]]))
    }

    #[test]
    fn all_null_shredding_state() {
        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]]);
        let shredding_state = ShreddingState::try_new(metadata.clone(), None, None).unwrap();

        assert!(matches!(shredding_state, ShreddingState::AllNull { .. }));

        // Verify metadata is preserved correctly
        if let ShreddingState::AllNull { metadata: m } = shredding_state {
            assert_eq!(m.len(), metadata.len());
            assert_eq!(m.value(0), metadata.value(0));
        }
    }

    #[test]
    fn all_null_variant_array_construction() {
        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
        let nulls = NullBuffer::from(vec![false, false, false]); // all null

        let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
        let struct_array = StructArray::new(fields, vec![Arc::new(metadata)], Some(nulls));

        let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap();

        // Verify the shredding state is AllNull
        assert!(matches!(
            variant_array.shredding_state(),
            ShreddingState::AllNull { .. }
        ));

        // Verify all values are null
        assert_eq!(variant_array.len(), 3);
        assert!(!variant_array.is_valid(0));
        assert!(!variant_array.is_valid(1));
        assert!(!variant_array.is_valid(2));

        // Verify that every row is null AND that value() returns Variant::Null.
        // In the AllNull state value() reads no column, so calling it on a
        // null row is safe.
        for i in 0..variant_array.len() {
            assert!(
                !variant_array.is_valid(i),
                "Expected value at index {i} to be null"
            );
            assert_eq!(
                variant_array.value(i),
                parquet_variant::Variant::Null,
                "Expected value() at index {i} to be Variant::Null"
            );
        }
    }

    #[test]
    fn value_field_present_but_all_null_should_be_unshredded() {
        // This test demonstrates the issue: when a value field exists in schema
        // but all its values are null, it should remain Unshredded, not AllNull
        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);

        // Create a value field with all null values
        let value_nulls = NullBuffer::from(vec![false, false, false]); // all null
        let value_array = BinaryViewArray::from_iter_values(vec![""; 3]);
        let value_data = value_array
            .to_data()
            .into_builder()
            .nulls(Some(value_nulls))
            .build()
            .unwrap();
        let value = BinaryViewArray::from(value_data);

        let fields = Fields::from(vec![
            Field::new("metadata", DataType::BinaryView, false),
            Field::new("value", DataType::BinaryView, true), // Field exists in schema
        ]);
        let struct_array = StructArray::new(
            fields,
            vec![Arc::new(metadata), Arc::new(value)],
            None, // struct itself is not null, just the value field is all null
        );

        let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap();

        // This should be Unshredded, not AllNull, because value field exists in schema
        assert!(matches!(
            variant_array.shredding_state(),
            ShreddingState::Unshredded { .. }
        ));
    }
}