parquet_variant_compute/variant_array.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArray`] implementation
19
20use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, StructArray};
21use arrow::buffer::NullBuffer;
22use arrow::datatypes::Int32Type;
23use arrow_schema::{ArrowError, DataType};
24use parquet_variant::Variant;
25use std::any::Any;
26use std::sync::Arc;
27
28/// An array of Parquet [`Variant`] values
29///
30/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
31/// `metadata` and `value` fields, and adds convenience methods to access
32/// the `Variant`s
33///
34/// See [`VariantArrayBuilder`] for constructing a `VariantArray`.
35///
36/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder
37///
38/// # Specification
39///
40/// 1. This code follows the conventions for storing variants in Arrow `StructArray`
41/// defined by [Extension Type for Parquet Variant arrow] and this [document].
42/// At the time of this writing, this is not yet a standardized Arrow extension type.
43///
44/// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908
45/// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing
46#[derive(Debug)]
47pub struct VariantArray {
48 /// Reference to the underlying StructArray
49 inner: StructArray,
50
51 /// how is this variant array shredded?
52 shredding_state: ShreddingState,
53}
54
55impl VariantArray {
56 /// Creates a new `VariantArray` from a [`StructArray`].
57 ///
58 /// # Arguments
59 /// - `inner` - The underlying [`StructArray`] that contains the variant data.
60 ///
61 /// # Returns
62 /// - A new instance of `VariantArray`.
63 ///
64 /// # Errors:
65 /// - If the `StructArray` does not contain the required fields
66 ///
67 /// # Requirements of the `StructArray`
68 ///
69 /// 1. A required field named `metadata` which is binary, large_binary, or
70 /// binary_view
71 ///
72 /// 2. An optional field named `value` that is binary, large_binary, or
73 /// binary_view
74 ///
75 /// 3. An optional field named `typed_value` which can be any primitive type
76 /// or be a list, large_list, list_view or struct
77 ///
78 /// NOTE: It is also permissible for the metadata field to be
79 /// Dictionary-Encoded, preferably (but not required) with an index type of
80 /// int8.
81 ///
82 /// Currently, only [`BinaryViewArray`] are supported.
83 pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> {
84 let Some(inner) = inner.as_struct_opt() else {
85 return Err(ArrowError::InvalidArgumentError(
86 "Invalid VariantArray: requires StructArray as input".to_string(),
87 ));
88 };
89
90 // Note the specification allows for any order so we must search by name
91
92 // Ensure the StructArray has a metadata field of BinaryView
93 let Some(metadata_field) = inner.column_by_name("metadata") else {
94 return Err(ArrowError::InvalidArgumentError(
95 "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
96 ));
97 };
98 let Some(metadata) = metadata_field.as_binary_view_opt() else {
99 return Err(ArrowError::NotYetImplemented(format!(
100 "VariantArray 'metadata' field must be BinaryView, got {}",
101 metadata_field.data_type()
102 )));
103 };
104
105 // Find the value field, if present
106 let value = inner
107 .column_by_name("value")
108 .map(|v| {
109 v.as_binary_view_opt().ok_or_else(|| {
110 ArrowError::NotYetImplemented(format!(
111 "VariantArray 'value' field must be BinaryView, got {}",
112 v.data_type()
113 ))
114 })
115 })
116 .transpose()?;
117
118 // Find the typed_value field, if present
119 let typed_value = inner.column_by_name("typed_value");
120
121 // Note these clones are cheap, they just bump the ref count
122 let inner = inner.clone();
123 let shredding_state =
124 ShreddingState::try_new(metadata.clone(), value.cloned(), typed_value.cloned())?;
125
126 Ok(Self {
127 inner,
128 shredding_state,
129 })
130 }
131
132 /// Returns a reference to the underlying [`StructArray`].
133 pub fn inner(&self) -> &StructArray {
134 &self.inner
135 }
136
137 /// Returns the inner [`StructArray`], consuming self
138 pub fn into_inner(self) -> StructArray {
139 self.inner
140 }
141
142 /// Return the shredding state of this `VariantArray`
143 pub fn shredding_state(&self) -> &ShreddingState {
144 &self.shredding_state
145 }
146
147 /// Return the [`Variant`] instance stored at the given row
148 ///
149 /// Consistently with other Arrow arrays types, this API requires you to
150 /// check for nulls first using [`Self::is_valid`].
151 ///
152 /// # Panics
153 /// * if the index is out of bounds
154 /// * if the array value is null
155 ///
156 /// If this is a shredded variant but has no value at the shredded location, it
157 /// will return [`Variant::Null`].
158 ///
159 ///
160 /// # Performance Note
161 ///
162 /// This is certainly not the most efficient way to access values in a
163 /// `VariantArray`, but it is useful for testing and debugging.
164 ///
165 /// Note: Does not do deep validation of the [`Variant`], so it is up to the
166 /// caller to ensure that the metadata and value were constructed correctly.
167 pub fn value(&self, index: usize) -> Variant<'_, '_> {
168 match &self.shredding_state {
169 ShreddingState::Unshredded { metadata, value } => {
170 Variant::new(metadata.value(index), value.value(index))
171 }
172 ShreddingState::Typed { typed_value, .. } => {
173 if typed_value.is_null(index) {
174 Variant::Null
175 } else {
176 typed_value_to_variant(typed_value, index)
177 }
178 }
179 ShreddingState::PartiallyShredded {
180 metadata,
181 value,
182 typed_value,
183 } => {
184 if typed_value.is_null(index) {
185 Variant::new(metadata.value(index), value.value(index))
186 } else {
187 typed_value_to_variant(typed_value, index)
188 }
189 }
190 }
191 }
192
193 /// Return a reference to the metadata field of the [`StructArray`]
194 pub fn metadata_field(&self) -> &BinaryViewArray {
195 self.shredding_state.metadata_field()
196 }
197
198 /// Return a reference to the value field of the `StructArray`
199 pub fn value_field(&self) -> Option<&BinaryViewArray> {
200 self.shredding_state.value_field()
201 }
202
203 /// Return a reference to the typed_value field of the `StructArray`, if present
204 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
205 self.shredding_state.typed_value_field()
206 }
207}
208
209/// Represents the shredding state of a [`VariantArray`]
210///
211/// [`VariantArray`]s can be shredded according to the [Parquet Variant
212/// Shredding Spec]. Shredding means that the actual value is stored in a typed
213/// `typed_field` instead of the generic `value` field.
214///
215/// Both value and typed_value are optional fields used together to encode a
216/// single value. Values in the two fields must be interpreted according to the
217/// following table (see [Parquet Variant Shredding Spec] for more details):
218///
219/// | value | typed_value | Meaning |
220/// |----------|--------------|---------|
221/// | null | null | The value is missing; only valid for shredded object fields |
222/// | non-null | null | The value is present and may be any type, including `null` |
223/// | null | non-null | The value is present and is the shredded type |
224/// | non-null | non-null | The value is present and is a partially shredded object |
225///
226/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
227#[derive(Debug)]
228pub enum ShreddingState {
229 // TODO: add missing state where there is neither value nor typed_value
230 // Missing { metadata: BinaryViewArray },
231 /// This variant has no typed_value field
232 Unshredded {
233 metadata: BinaryViewArray,
234 value: BinaryViewArray,
235 },
236 /// This variant has a typed_value field and no value field
237 /// meaning it is the shredded type
238 Typed {
239 metadata: BinaryViewArray,
240 typed_value: ArrayRef,
241 },
242 /// Partially shredded:
243 /// * value is an object
244 /// * typed_value is a shredded object.
245 ///
246 /// Note the spec says "Writers must not produce data where both value and
247 /// typed_value are non-null, unless the Variant value is an object."
248 PartiallyShredded {
249 metadata: BinaryViewArray,
250 value: BinaryViewArray,
251 typed_value: ArrayRef,
252 },
253}
254
255impl ShreddingState {
256 /// try to create a new `ShreddingState` from the given fields
257 pub fn try_new(
258 metadata: BinaryViewArray,
259 value: Option<BinaryViewArray>,
260 typed_value: Option<ArrayRef>,
261 ) -> Result<Self, ArrowError> {
262 match (metadata, value, typed_value) {
263 (metadata, Some(value), Some(typed_value)) => Ok(Self::PartiallyShredded {
264 metadata,
265 value,
266 typed_value,
267 }),
268 (metadata, Some(value), None) => Ok(Self::Unshredded { metadata, value }),
269 (metadata, None, Some(typed_value)) => Ok(Self::Typed {
270 metadata,
271 typed_value,
272 }),
273 (_metadata_field, None, None) => Err(ArrowError::InvalidArgumentError(String::from(
274 "VariantArray has neither value nor typed_value field",
275 ))),
276 }
277 }
278
279 /// Return a reference to the metadata field
280 pub fn metadata_field(&self) -> &BinaryViewArray {
281 match self {
282 ShreddingState::Unshredded { metadata, .. } => metadata,
283 ShreddingState::Typed { metadata, .. } => metadata,
284 ShreddingState::PartiallyShredded { metadata, .. } => metadata,
285 }
286 }
287
288 /// Return a reference to the value field, if present
289 pub fn value_field(&self) -> Option<&BinaryViewArray> {
290 match self {
291 ShreddingState::Unshredded { value, .. } => Some(value),
292 ShreddingState::Typed { .. } => None,
293 ShreddingState::PartiallyShredded { value, .. } => Some(value),
294 }
295 }
296
297 /// Return a reference to the typed_value field, if present
298 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
299 match self {
300 ShreddingState::Unshredded { .. } => None,
301 ShreddingState::Typed { typed_value, .. } => Some(typed_value),
302 ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value),
303 }
304 }
305
306 /// Slice all the underlying arrays
307 pub fn slice(&self, offset: usize, length: usize) -> Self {
308 match self {
309 ShreddingState::Unshredded { metadata, value } => ShreddingState::Unshredded {
310 metadata: metadata.slice(offset, length),
311 value: value.slice(offset, length),
312 },
313 ShreddingState::Typed {
314 metadata,
315 typed_value,
316 } => ShreddingState::Typed {
317 metadata: metadata.slice(offset, length),
318 typed_value: typed_value.slice(offset, length),
319 },
320 ShreddingState::PartiallyShredded {
321 metadata,
322 value,
323 typed_value,
324 } => ShreddingState::PartiallyShredded {
325 metadata: metadata.slice(offset, length),
326 value: value.slice(offset, length),
327 typed_value: typed_value.slice(offset, length),
328 },
329 }
330 }
331}
332
333/// returns the non-null element at index as a Variant
334fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '_> {
335 match typed_value.data_type() {
336 DataType::Int32 => {
337 let typed_value = typed_value.as_primitive::<Int32Type>();
338 Variant::from(typed_value.value(index))
339 }
340 // todo other types here (note this is very similar to cast_to_variant.rs)
341 // so it would be great to figure out how to share this code
342 _ => {
343 // We shouldn't panic in production code, but this is a
344 // placeholder until we implement more types
345 // TODO tickets: XXXX
346 debug_assert!(
347 false,
348 "Unsupported typed_value type: {:?}",
349 typed_value.data_type()
350 );
351 Variant::Null
352 }
353 }
354}
355
356impl Array for VariantArray {
357 fn as_any(&self) -> &dyn Any {
358 self
359 }
360
361 fn to_data(&self) -> ArrayData {
362 self.inner.to_data()
363 }
364
365 fn into_data(self) -> ArrayData {
366 self.inner.into_data()
367 }
368
369 fn data_type(&self) -> &DataType {
370 self.inner.data_type()
371 }
372
373 fn slice(&self, offset: usize, length: usize) -> ArrayRef {
374 let inner = self.inner.slice(offset, length);
375 let shredding_state = self.shredding_state.slice(offset, length);
376 Arc::new(Self {
377 inner,
378 shredding_state,
379 })
380 }
381
382 fn len(&self) -> usize {
383 self.inner.len()
384 }
385
386 fn is_empty(&self) -> bool {
387 self.inner.is_empty()
388 }
389
390 fn offset(&self) -> usize {
391 self.inner.offset()
392 }
393
394 fn nulls(&self) -> Option<&NullBuffer> {
395 self.inner.nulls()
396 }
397
398 fn get_buffer_memory_size(&self) -> usize {
399 self.inner.get_buffer_memory_size()
400 }
401
402 fn get_array_memory_size(&self) -> usize {
403 self.inner.get_array_memory_size()
404 }
405}
406
#[cfg(test)]
mod test {
    use super::*;
    use arrow::array::{BinaryArray, BinaryViewArray};
    use arrow_schema::{Field, Fields};

    /// One-element `BinaryViewArray` fixture
    fn make_binary_view_array() -> ArrayRef {
        Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]]))
    }

    /// One-element `BinaryArray` fixture (a type `VariantArray` does not accept yet)
    fn make_binary_array() -> ArrayRef {
        Arc::new(BinaryArray::from(vec![b"test" as &[u8]]))
    }

    /// Wraps `columns` in a `StructArray` without a null buffer, feeds it to
    /// `VariantArray::try_new`, and returns the resulting error message
    fn try_new_error(fields: Vec<Field>, columns: Vec<ArrayRef>) -> String {
        let array = StructArray::new(Fields::from(fields), columns, None);
        VariantArray::try_new(Arc::new(array))
            .unwrap_err()
            .to_string()
    }

    #[test]
    fn invalid_not_a_struct_array() {
        // A bare BinaryViewArray is not a StructArray, so construction must fail
        let result = VariantArray::try_new(make_binary_view_array());
        assert_eq!(
            result.unwrap_err().to_string(),
            "Invalid argument error: Invalid VariantArray: requires StructArray as input"
        );
    }

    #[test]
    fn invalid_missing_metadata() {
        // Only a 'value' field is supplied; 'metadata' is absent
        let message = try_new_error(
            vec![Field::new("value", DataType::BinaryView, true)],
            vec![make_binary_view_array()],
        );
        assert_eq!(
            message,
            "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
        );
    }

    #[test]
    fn invalid_missing_value() {
        // Only 'metadata' is supplied; both 'value' and 'typed_value' are absent
        let message = try_new_error(
            vec![Field::new("metadata", DataType::BinaryView, false)],
            vec![make_binary_view_array()],
        );
        assert_eq!(
            message,
            "Invalid argument error: VariantArray has neither value nor typed_value field"
        );
    }

    #[test]
    fn invalid_metadata_field_type() {
        let message = try_new_error(
            vec![
                Field::new("metadata", DataType::Binary, true), // Not yet supported
                Field::new("value", DataType::BinaryView, true),
            ],
            vec![make_binary_array(), make_binary_view_array()],
        );
        assert_eq!(
            message,
            "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Binary"
        );
    }

    #[test]
    fn invalid_value_field_type() {
        let message = try_new_error(
            vec![
                Field::new("metadata", DataType::BinaryView, true),
                Field::new("value", DataType::Binary, true), // Not yet supported
            ],
            vec![make_binary_view_array(), make_binary_array()],
        );
        assert_eq!(
            message,
            "Not yet implemented: VariantArray 'value' field must be BinaryView, got Binary"
        );
    }
}