parquet_variant_compute/
variant_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArray`] implementation
19
20use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, StructArray};
21use arrow::buffer::NullBuffer;
22use arrow::datatypes::Int32Type;
23use arrow_schema::{ArrowError, DataType};
24use parquet_variant::Variant;
25use std::any::Any;
26use std::sync::Arc;
27
28/// An array of Parquet [`Variant`] values
29///
30/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
31/// `metadata` and `value` fields, and adds convenience methods to access
32/// the `Variant`s
33///
34/// See [`VariantArrayBuilder`] for constructing a `VariantArray`.
35///
36/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder
37///
38/// # Specification
39///
40/// 1. This code follows the conventions for storing variants in Arrow `StructArray`
41///    defined by [Extension Type for Parquet Variant arrow] and this [document].
42///    At the time of this writing, this is not yet a standardized Arrow extension type.
43///
44/// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908
45/// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing
46#[derive(Debug)]
47pub struct VariantArray {
48    /// Reference to the underlying StructArray
49    inner: StructArray,
50
51    /// how is this variant array shredded?
52    shredding_state: ShreddingState,
53}
54
55impl VariantArray {
56    /// Creates a new `VariantArray` from a [`StructArray`].
57    ///
58    /// # Arguments
59    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
60    ///
61    /// # Returns
62    /// - A new instance of `VariantArray`.
63    ///
64    /// # Errors:
65    /// - If the `StructArray` does not contain the required fields
66    ///
67    /// # Requirements of the `StructArray`
68    ///
69    /// 1. A required field named `metadata` which is binary, large_binary, or
70    ///    binary_view
71    ///
72    /// 2. An optional field named `value` that is binary, large_binary, or
73    ///    binary_view
74    ///
75    /// 3. An optional field named `typed_value` which can be any primitive type
76    ///    or be a list, large_list, list_view or struct
77    ///
78    /// NOTE: It is also permissible for the metadata field to be
79    /// Dictionary-Encoded, preferably (but not required) with an index type of
80    /// int8.
81    ///
82    /// Currently, only [`BinaryViewArray`] are supported.
83    pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> {
84        let Some(inner) = inner.as_struct_opt() else {
85            return Err(ArrowError::InvalidArgumentError(
86                "Invalid VariantArray: requires StructArray as input".to_string(),
87            ));
88        };
89
90        // Note the specification allows for any order so we must search by name
91
92        // Ensure the StructArray has a metadata field of BinaryView
93        let Some(metadata_field) = inner.column_by_name("metadata") else {
94            return Err(ArrowError::InvalidArgumentError(
95                "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
96            ));
97        };
98        let Some(metadata) = metadata_field.as_binary_view_opt() else {
99            return Err(ArrowError::NotYetImplemented(format!(
100                "VariantArray 'metadata' field must be BinaryView, got {}",
101                metadata_field.data_type()
102            )));
103        };
104
105        // Find the value field, if present
106        let value = inner
107            .column_by_name("value")
108            .map(|v| {
109                v.as_binary_view_opt().ok_or_else(|| {
110                    ArrowError::NotYetImplemented(format!(
111                        "VariantArray 'value' field must be BinaryView, got {}",
112                        v.data_type()
113                    ))
114                })
115            })
116            .transpose()?;
117
118        // Find the typed_value field, if present
119        let typed_value = inner.column_by_name("typed_value");
120
121        // Note these clones are cheap, they just bump the ref count
122        let inner = inner.clone();
123        let shredding_state =
124            ShreddingState::try_new(metadata.clone(), value.cloned(), typed_value.cloned())?;
125
126        Ok(Self {
127            inner,
128            shredding_state,
129        })
130    }
131
132    /// Returns a reference to the underlying [`StructArray`].
133    pub fn inner(&self) -> &StructArray {
134        &self.inner
135    }
136
137    /// Returns the inner [`StructArray`], consuming self
138    pub fn into_inner(self) -> StructArray {
139        self.inner
140    }
141
142    /// Return the shredding state of this `VariantArray`
143    pub fn shredding_state(&self) -> &ShreddingState {
144        &self.shredding_state
145    }
146
147    /// Return the [`Variant`] instance stored at the given row
148    ///
149    /// Consistently with other Arrow arrays types, this API requires you to
150    /// check for nulls first using [`Self::is_valid`].
151    ///
152    /// # Panics
153    /// * if the index is out of bounds
154    /// * if the array value is null
155    ///
156    /// If this is a shredded variant but has no value at the shredded location, it
157    /// will return [`Variant::Null`].
158    ///
159    ///
160    /// # Performance Note
161    ///
162    /// This is certainly not the most efficient way to access values in a
163    /// `VariantArray`, but it is useful for testing and debugging.
164    ///
165    /// Note: Does not do deep validation of the [`Variant`], so it is up to the
166    /// caller to ensure that the metadata and value were constructed correctly.
167    pub fn value(&self, index: usize) -> Variant<'_, '_> {
168        match &self.shredding_state {
169            ShreddingState::Unshredded { metadata, value } => {
170                Variant::new(metadata.value(index), value.value(index))
171            }
172            ShreddingState::Typed { typed_value, .. } => {
173                if typed_value.is_null(index) {
174                    Variant::Null
175                } else {
176                    typed_value_to_variant(typed_value, index)
177                }
178            }
179            ShreddingState::PartiallyShredded {
180                metadata,
181                value,
182                typed_value,
183            } => {
184                if typed_value.is_null(index) {
185                    Variant::new(metadata.value(index), value.value(index))
186                } else {
187                    typed_value_to_variant(typed_value, index)
188                }
189            }
190        }
191    }
192
193    /// Return a reference to the metadata field of the [`StructArray`]
194    pub fn metadata_field(&self) -> &BinaryViewArray {
195        self.shredding_state.metadata_field()
196    }
197
198    /// Return a reference to the value field of the `StructArray`
199    pub fn value_field(&self) -> Option<&BinaryViewArray> {
200        self.shredding_state.value_field()
201    }
202
203    /// Return a reference to the typed_value field of the `StructArray`, if present
204    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
205        self.shredding_state.typed_value_field()
206    }
207}
208
209/// Represents the shredding state of a [`VariantArray`]
210///
211/// [`VariantArray`]s can be shredded according to the [Parquet Variant
212/// Shredding Spec]. Shredding means that the actual value is stored in a typed
213/// `typed_field` instead of the generic `value` field.
214///
215/// Both value and typed_value are optional fields used together to encode a
216/// single value. Values in the two fields must be interpreted according to the
217/// following table (see [Parquet Variant Shredding Spec] for more details):
218///
219/// | value | typed_value | Meaning |
220/// |----------|--------------|---------|
221/// | null     | null         | The value is missing; only valid for shredded object fields |
222/// | non-null | null         | The value is present and may be any type, including `null` |
223/// | null     | non-null     | The value is present and is the shredded type |
224/// | non-null | non-null     | The value is present and is a partially shredded object |
225///
226/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
227#[derive(Debug)]
228pub enum ShreddingState {
229    // TODO: add missing state where there is neither value nor typed_value
230    // Missing { metadata: BinaryViewArray },
231    /// This variant has no typed_value field
232    Unshredded {
233        metadata: BinaryViewArray,
234        value: BinaryViewArray,
235    },
236    /// This variant has a typed_value field and no value field
237    /// meaning it is the shredded type
238    Typed {
239        metadata: BinaryViewArray,
240        typed_value: ArrayRef,
241    },
242    /// Partially shredded:
243    /// * value is an object
244    /// * typed_value is a shredded object.
245    ///
246    /// Note the spec says "Writers must not produce data where both value and
247    /// typed_value are non-null, unless the Variant value is an object."
248    PartiallyShredded {
249        metadata: BinaryViewArray,
250        value: BinaryViewArray,
251        typed_value: ArrayRef,
252    },
253}
254
255impl ShreddingState {
256    /// try to create a new `ShreddingState` from the given fields
257    pub fn try_new(
258        metadata: BinaryViewArray,
259        value: Option<BinaryViewArray>,
260        typed_value: Option<ArrayRef>,
261    ) -> Result<Self, ArrowError> {
262        match (metadata, value, typed_value) {
263            (metadata, Some(value), Some(typed_value)) => Ok(Self::PartiallyShredded {
264                metadata,
265                value,
266                typed_value,
267            }),
268            (metadata, Some(value), None) => Ok(Self::Unshredded { metadata, value }),
269            (metadata, None, Some(typed_value)) => Ok(Self::Typed {
270                metadata,
271                typed_value,
272            }),
273            (_metadata_field, None, None) => Err(ArrowError::InvalidArgumentError(String::from(
274                "VariantArray has neither value nor typed_value field",
275            ))),
276        }
277    }
278
279    /// Return a reference to the metadata field
280    pub fn metadata_field(&self) -> &BinaryViewArray {
281        match self {
282            ShreddingState::Unshredded { metadata, .. } => metadata,
283            ShreddingState::Typed { metadata, .. } => metadata,
284            ShreddingState::PartiallyShredded { metadata, .. } => metadata,
285        }
286    }
287
288    /// Return a reference to the value field, if present
289    pub fn value_field(&self) -> Option<&BinaryViewArray> {
290        match self {
291            ShreddingState::Unshredded { value, .. } => Some(value),
292            ShreddingState::Typed { .. } => None,
293            ShreddingState::PartiallyShredded { value, .. } => Some(value),
294        }
295    }
296
297    /// Return a reference to the typed_value field, if present
298    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
299        match self {
300            ShreddingState::Unshredded { .. } => None,
301            ShreddingState::Typed { typed_value, .. } => Some(typed_value),
302            ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value),
303        }
304    }
305
306    /// Slice all the underlying arrays
307    pub fn slice(&self, offset: usize, length: usize) -> Self {
308        match self {
309            ShreddingState::Unshredded { metadata, value } => ShreddingState::Unshredded {
310                metadata: metadata.slice(offset, length),
311                value: value.slice(offset, length),
312            },
313            ShreddingState::Typed {
314                metadata,
315                typed_value,
316            } => ShreddingState::Typed {
317                metadata: metadata.slice(offset, length),
318                typed_value: typed_value.slice(offset, length),
319            },
320            ShreddingState::PartiallyShredded {
321                metadata,
322                value,
323                typed_value,
324            } => ShreddingState::PartiallyShredded {
325                metadata: metadata.slice(offset, length),
326                value: value.slice(offset, length),
327                typed_value: typed_value.slice(offset, length),
328            },
329        }
330    }
331}
332
333/// returns the non-null element at index as a Variant
334fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '_> {
335    match typed_value.data_type() {
336        DataType::Int32 => {
337            let typed_value = typed_value.as_primitive::<Int32Type>();
338            Variant::from(typed_value.value(index))
339        }
340        // todo other types here (note this is very similar to cast_to_variant.rs)
341        // so it would be great to figure out how to share this code
342        _ => {
343            // We shouldn't panic in production code, but this is a
344            // placeholder until we implement more types
345            // TODO tickets: XXXX
346            debug_assert!(
347                false,
348                "Unsupported typed_value type: {:?}",
349                typed_value.data_type()
350            );
351            Variant::Null
352        }
353    }
354}
355
356impl Array for VariantArray {
357    fn as_any(&self) -> &dyn Any {
358        self
359    }
360
361    fn to_data(&self) -> ArrayData {
362        self.inner.to_data()
363    }
364
365    fn into_data(self) -> ArrayData {
366        self.inner.into_data()
367    }
368
369    fn data_type(&self) -> &DataType {
370        self.inner.data_type()
371    }
372
373    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
374        let inner = self.inner.slice(offset, length);
375        let shredding_state = self.shredding_state.slice(offset, length);
376        Arc::new(Self {
377            inner,
378            shredding_state,
379        })
380    }
381
382    fn len(&self) -> usize {
383        self.inner.len()
384    }
385
386    fn is_empty(&self) -> bool {
387        self.inner.is_empty()
388    }
389
390    fn offset(&self) -> usize {
391        self.inner.offset()
392    }
393
394    fn nulls(&self) -> Option<&NullBuffer> {
395        self.inner.nulls()
396    }
397
398    fn get_buffer_memory_size(&self) -> usize {
399        self.inner.get_buffer_memory_size()
400    }
401
402    fn get_array_memory_size(&self) -> usize {
403        self.inner.get_array_memory_size()
404    }
405}
406
#[cfg(test)]
mod test {
    use super::*;
    use arrow::array::{BinaryArray, BinaryViewArray};
    use arrow_schema::{Field, Fields};

    /// One-row `BinaryView` column, the only binary type currently accepted
    fn make_binary_view_array() -> ArrayRef {
        Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]]))
    }

    /// One-row `Binary` column, used to trigger the unsupported-type errors
    fn make_binary_array() -> ArrayRef {
        Arc::new(BinaryArray::from(vec![b"test" as &[u8]]))
    }

    /// Wraps `columns` in a `StructArray` described by `fields` and returns the
    /// message of the error `VariantArray::try_new` produces for it
    fn try_new_error_message(fields: Vec<Field>, columns: Vec<ArrayRef>) -> String {
        let struct_array = StructArray::new(Fields::from(fields), columns, None);
        VariantArray::try_new(Arc::new(struct_array))
            .unwrap_err()
            .to_string()
    }

    #[test]
    fn invalid_not_a_struct_array() {
        // A bare BinaryView column is not a StructArray, so this must fail
        let message = VariantArray::try_new(make_binary_view_array())
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            "Invalid argument error: Invalid VariantArray: requires StructArray as input"
        );
    }

    #[test]
    fn invalid_missing_metadata() {
        // Only a 'value' field is supplied; the required 'metadata' is absent
        let message = try_new_error_message(
            vec![Field::new("value", DataType::BinaryView, true)],
            vec![make_binary_view_array()],
        );
        assert_eq!(
            message,
            "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
        );
    }

    #[test]
    fn invalid_missing_value() {
        // Only 'metadata' is supplied, so neither 'value' nor 'typed_value' exists
        let message = try_new_error_message(
            vec![Field::new("metadata", DataType::BinaryView, false)],
            vec![make_binary_view_array()],
        );
        assert_eq!(
            message,
            "Invalid argument error: VariantArray has neither value nor typed_value field"
        );
    }

    #[test]
    fn invalid_metadata_field_type() {
        let message = try_new_error_message(
            vec![
                Field::new("metadata", DataType::Binary, true), // Not yet supported
                Field::new("value", DataType::BinaryView, true),
            ],
            vec![make_binary_array(), make_binary_view_array()],
        );
        assert_eq!(
            message,
            "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Binary"
        );
    }

    #[test]
    fn invalid_value_field_type() {
        let message = try_new_error_message(
            vec![
                Field::new("metadata", DataType::BinaryView, true),
                Field::new("value", DataType::Binary, true), // Not yet supported
            ],
            vec![make_binary_view_array(), make_binary_array()],
        );
        assert_eq!(
            message,
            "Not yet implemented: VariantArray 'value' field must be BinaryView, got Binary"
        );
    }
}
491}