parquet_variant_compute/
variant_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArray`] implementation
19
20use arrow::array::{Array, ArrayData, ArrayRef, AsArray, StructArray};
21use arrow::buffer::NullBuffer;
22use arrow_schema::{ArrowError, DataType};
23use parquet_variant::Variant;
24use std::any::Any;
25use std::sync::Arc;
26
27/// An array of Parquet [`Variant`] values
28///
29/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
30/// `metadata` and `value` fields, and adds convenience methods to access
31/// the `Variant`s
32///
33/// See [`VariantArrayBuilder`] for constructing a `VariantArray`.
34///
35/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder
36///
37/// # Specification
38///
39/// 1. This code follows the conventions for storing variants in Arrow `StructArray`
40///    defined by [Extension Type for Parquet Variant arrow] and this [document].
41///    At the time of this writing, this is not yet a standardized Arrow extension type.
42///
43/// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908
44/// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing
45#[derive(Debug)]
46pub struct VariantArray {
47    /// StructArray of up to three fields:
48    ///
49    /// 1. A required field named `metadata` which is binary, large_binary, or
50    ///    binary_view
51    ///
52    /// 2. An optional field named `value` that is binary, large_binary, or
53    ///    binary_view
54    ///
55    /// 3. An optional field named `typed_value` which can be any primitive type
56    ///    or be a list, large_list, list_view or struct
57    ///
58    /// NOTE: It is also permissible for the metadata field to be
59    /// Dictionary-Encoded, preferably (but not required) with an index type of
60    /// int8.
61    inner: StructArray,
62
63    /// Reference to the metadata column of inner
64    metadata_ref: ArrayRef,
65
66    /// Reference to the value column of inner
67    value_ref: ArrayRef,
68}
69
70impl VariantArray {
71    /// Creates a new `VariantArray` from a [`StructArray`].
72    ///
73    /// # Arguments
74    /// - `inner` - The underlying [`StructArray`] that contains the variant data.
75    ///
76    /// # Returns
77    /// - A new instance of `VariantArray`.
78    ///
79    /// # Errors:
80    /// - If the `StructArray` does not contain the required fields
81    ///
82    /// # Current support
83    /// This structure does not (yet) support the full Arrow Variant Array specification.
84    ///
85    /// Only `StructArrays` with `metadata` and `value` fields that are
86    /// [`BinaryViewArray`] are supported. Shredded values are not currently supported
87    /// nor are using types other than `BinaryViewArray`
88    ///
89    /// [`BinaryViewArray`]: arrow::array::BinaryViewArray
90    pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> {
91        let Some(inner) = inner.as_struct_opt() else {
92            return Err(ArrowError::InvalidArgumentError(
93                "Invalid VariantArray: requires StructArray as input".to_string(),
94            ));
95        };
96        // Ensure the StructArray has a metadata field of BinaryView
97
98        let Some(metadata_field) = VariantArray::find_metadata_field(inner) else {
99            return Err(ArrowError::InvalidArgumentError(
100                "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
101            ));
102        };
103        if metadata_field.data_type() != &DataType::BinaryView {
104            return Err(ArrowError::NotYetImplemented(format!(
105                "VariantArray 'metadata' field must be BinaryView, got {}",
106                metadata_field.data_type()
107            )));
108        }
109        let Some(value_field) = VariantArray::find_value_field(inner) else {
110            return Err(ArrowError::InvalidArgumentError(
111                "Invalid VariantArray: StructArray must contain a 'value' field".to_string(),
112            ));
113        };
114        if value_field.data_type() != &DataType::BinaryView {
115            return Err(ArrowError::NotYetImplemented(format!(
116                "VariantArray 'value' field must be BinaryView, got {}",
117                value_field.data_type()
118            )));
119        }
120
121        Ok(Self {
122            inner: inner.clone(),
123            metadata_ref: metadata_field,
124            value_ref: value_field,
125        })
126    }
127
128    /// Returns a reference to the underlying [`StructArray`].
129    pub fn inner(&self) -> &StructArray {
130        &self.inner
131    }
132
133    /// Returns the inner [`StructArray`], consuming self
134    pub fn into_inner(self) -> StructArray {
135        self.inner
136    }
137
138    /// Return the [`Variant`] instance stored at the given row
139    ///
140    /// Panics if the index is out of bounds.
141    ///
142    /// Note: Does not do deep validation of the [`Variant`], so it is up to the
143    /// caller to ensure that the metadata and value were constructed correctly.
144    pub fn value(&self, index: usize) -> Variant {
145        let metadata = self.metadata_field().as_binary_view().value(index);
146        let value = self.value_field().as_binary_view().value(index);
147        Variant::new(metadata, value)
148    }
149
150    fn find_metadata_field(array: &StructArray) -> Option<ArrayRef> {
151        array.column_by_name("metadata").cloned()
152    }
153
154    fn find_value_field(array: &StructArray) -> Option<ArrayRef> {
155        array.column_by_name("value").cloned()
156    }
157
158    /// Return a reference to the metadata field of the [`StructArray`]
159    pub fn metadata_field(&self) -> &ArrayRef {
160        // spec says fields order is not guaranteed, so we search by name
161        &self.metadata_ref
162    }
163
164    /// Return a reference to the value field of the `StructArray`
165    pub fn value_field(&self) -> &ArrayRef {
166        // spec says fields order is not guaranteed, so we search by name
167        &self.value_ref
168    }
169}
170
171impl Array for VariantArray {
172    fn as_any(&self) -> &dyn Any {
173        self
174    }
175
176    fn to_data(&self) -> ArrayData {
177        self.inner.to_data()
178    }
179
180    fn into_data(self) -> ArrayData {
181        self.inner.into_data()
182    }
183
184    fn data_type(&self) -> &DataType {
185        self.inner.data_type()
186    }
187
188    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
189        let slice = self.inner.slice(offset, length);
190        let met = self.metadata_ref.slice(offset, length);
191        let val = self.value_ref.slice(offset, length);
192        Arc::new(Self {
193            inner: slice,
194            metadata_ref: met,
195            value_ref: val,
196        })
197    }
198
199    fn len(&self) -> usize {
200        self.inner.len()
201    }
202
203    fn is_empty(&self) -> bool {
204        self.inner.is_empty()
205    }
206
207    fn offset(&self) -> usize {
208        self.inner.offset()
209    }
210
211    fn nulls(&self) -> Option<&NullBuffer> {
212        self.inner.nulls()
213    }
214
215    fn get_buffer_memory_size(&self) -> usize {
216        self.inner.get_buffer_memory_size()
217    }
218
219    fn get_array_memory_size(&self) -> usize {
220        self.inner.get_array_memory_size()
221    }
222}
223
#[cfg(test)]
mod test {
    use super::*;
    use arrow::array::{BinaryArray, BinaryViewArray};
    use arrow_schema::{Field, Fields};

    /// One-row `BinaryViewArray` fixture.
    fn make_binary_view_array() -> ArrayRef {
        Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]]))
    }

    /// One-row `BinaryArray` fixture, used to provoke the "wrong type" errors.
    fn make_binary_array() -> ArrayRef {
        Arc::new(BinaryArray::from(vec![b"test" as &[u8]]))
    }

    /// Builds a `StructArray` (without a null buffer) from `fields` and
    /// `columns`, passes it to `VariantArray::try_new`, and returns the text
    /// of the error that comes back.
    fn try_new_error(fields: Vec<Field>, columns: Vec<ArrayRef>) -> String {
        let array = StructArray::new(Fields::from(fields), columns, None);
        VariantArray::try_new(Arc::new(array))
            .unwrap_err()
            .to_string()
    }

    #[test]
    fn invalid_not_a_struct_array() {
        // A bare BinaryViewArray is not a StructArray, so it must be rejected.
        let message = VariantArray::try_new(make_binary_view_array())
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            "Invalid argument error: Invalid VariantArray: requires StructArray as input"
        );
    }

    #[test]
    fn invalid_missing_metadata() {
        // Only a 'value' column is present; the 'metadata' column is missing.
        let message = try_new_error(
            vec![Field::new("value", DataType::BinaryView, true)],
            vec![make_binary_view_array()],
        );
        assert_eq!(
            message,
            "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
        );
    }

    #[test]
    fn invalid_missing_value() {
        // Only a 'metadata' column is present; the 'value' column is missing.
        let message = try_new_error(
            vec![Field::new("metadata", DataType::BinaryView, false)],
            vec![make_binary_view_array()],
        );
        assert_eq!(
            message,
            "Invalid argument error: Invalid VariantArray: StructArray must contain a 'value' field"
        );
    }

    #[test]
    fn invalid_metadata_field_type() {
        let message = try_new_error(
            vec![
                Field::new("metadata", DataType::Binary, true), // Not yet supported
                Field::new("value", DataType::BinaryView, true),
            ],
            vec![make_binary_array(), make_binary_view_array()],
        );
        assert_eq!(
            message,
            "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Binary"
        );
    }

    #[test]
    fn invalid_value_field_type() {
        let message = try_new_error(
            vec![
                Field::new("metadata", DataType::BinaryView, true),
                Field::new("value", DataType::Binary, true), // Not yet supported
            ],
            vec![make_binary_view_array(), make_binary_array()],
        );
        assert_eq!(
            message,
            "Not yet implemented: VariantArray 'value' field must be BinaryView, got Binary"
        );
    }
}