parquet_variant_compute/variant_array.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArray`] implementation
19
20use crate::type_conversion::{generic_conversion_single_value, primitive_conversion_single_value};
21use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
22use arrow::buffer::NullBuffer;
23use arrow::compute::cast;
24use arrow::datatypes::{
25 Date32Type, Float16Type, Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, Int64Type,
26 TimestampMicrosecondType, TimestampNanosecondType,
27};
28use arrow_schema::extension::ExtensionType;
29use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
30use chrono::DateTime;
31use parquet_variant::Uuid;
32use parquet_variant::Variant;
33
34use std::borrow::Cow;
35use std::sync::Arc;
36
37/// Arrow Variant [`ExtensionType`].
38///
39/// Represents the canonical Arrow Extension Type for storing variants.
40/// See [`VariantArray`] for more examples of using this extension type.
41pub struct VariantType;
42
43impl ExtensionType for VariantType {
44 const NAME: &'static str = "arrow.parquet.variant";
45
46 // Variants extension metadata is an empty string
47 // <https://github.com/apache/arrow/blob/d803afcc43f5d132506318fd9e162d33b2c3d4cd/docs/source/format/CanonicalExtensions.rst?plain=1#L473>
48 type Metadata = &'static str;
49
50 fn metadata(&self) -> &Self::Metadata {
51 &""
52 }
53
54 fn serialize_metadata(&self) -> Option<String> {
55 Some(String::new())
56 }
57
58 fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
59 Ok("")
60 }
61
62 fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
63 if matches!(data_type, DataType::Struct(_)) {
64 Ok(())
65 } else {
66 Err(ArrowError::InvalidArgumentError(format!(
67 "VariantType only supports StructArray, got {data_type}"
68 )))
69 }
70 }
71
72 fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
73 Self.supports_data_type(data_type)?;
74 Ok(Self)
75 }
76}
77
78/// An array of Parquet [`Variant`] values
79///
80/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
81/// `metadata` and `value` fields, and adds convenience methods to access
82/// the [`Variant`]s.
83///
84/// See [`VariantArrayBuilder`] for constructing `VariantArray` row by row.
85///
/// See the examples below for converting between `VariantArray` and
/// `StructArray`.
88///
89/// [`VariantArrayBuilder`]: crate::VariantArrayBuilder
90///
91/// # Documentation
92///
93/// At the time of this writing, Variant has been accepted as an official
94/// extension type but not been published to the [official list of extension
95/// types] on the Apache Arrow website. See the [Extension Type for Parquet
96/// Variant arrow] ticket for more details.
97///
98/// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908
99/// [official list of extension types]: https://arrow.apache.org/docs/format/CanonicalExtensions.html
100///
101/// # Example: Check if a [`StructArray`] has the [`VariantType`] extension
102///
103/// Arrow Arrays only provide [`DataType`], but the extension type information
104/// is stored on a [`Field`]. Thus, you must have access to the [`Schema`] or
105/// [`Field`] to check for the extension type.
106///
107/// [`Schema`]: arrow_schema::Schema
108/// ```
109/// # use arrow::array::StructArray;
110/// # use arrow_schema::{Schema, Field, DataType};
111/// # use parquet_variant::Variant;
112/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
113/// # fn get_variant_array() -> VariantArray {
114/// # let mut builder = VariantArrayBuilder::new(10);
115/// # builder.append_variant(Variant::from("such wow"));
116/// # builder.build()
117/// # }
118/// # fn get_schema() -> Schema {
119/// # Schema::new(vec![
120/// # Field::new("id", DataType::Int32, false),
121/// # get_variant_array().field("var"),
122/// # ])
123/// # }
124/// let schema = get_schema();
125/// assert_eq!(schema.fields().len(), 2);
126/// // first field is not a Variant
127/// assert!(schema.field(0).try_extension_type::<VariantType>().is_err());
128/// // second field is a Variant
129/// assert!(schema.field(1).try_extension_type::<VariantType>().is_ok());
130/// ```
131///
132/// # Example: Constructing the correct [`Field`] for a [`VariantArray`]
133///
134/// You can construct the correct [`Field`] for a [`VariantArray`] using the
135/// [`VariantArray::field`] method.
136///
137/// ```
138/// # use arrow_schema::{Schema, Field, DataType};
139/// # use parquet_variant::Variant;
140/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
141/// # fn get_variant_array() -> VariantArray {
142/// # let mut builder = VariantArrayBuilder::new(10);
143/// # builder.append_variant(Variant::from("such wow"));
144/// # builder.build()
145/// # }
146/// let variant_array = get_variant_array();
147/// // First field is an integer id, second field is a variant
148/// let schema = Schema::new(vec![
149/// Field::new("id", DataType::Int32, false),
150/// // call VariantArray::field to get the correct Field
151/// variant_array.field("var"),
152/// ]);
153/// ```
154///
155/// You can also construct the [`Field`] using [`VariantType`] directly
156///
157/// ```
158/// # use arrow_schema::{Schema, Field, DataType};
159/// # use parquet_variant::Variant;
160/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType};
161/// # fn get_variant_array() -> VariantArray {
162/// # let mut builder = VariantArrayBuilder::new(10);
163/// # builder.append_variant(Variant::from("such wow"));
164/// # builder.build()
165/// # }
166/// # let variant_array = get_variant_array();
167/// // The DataType of a VariantArray varies depending on how it is shredded
168/// let data_type = variant_array.data_type().clone();
169/// // First field is an integer id, second field is a variant
170/// let schema = Schema::new(vec![
171/// Field::new("id", DataType::Int32, false),
172/// Field::new("var", data_type, false)
173/// // Add extension metadata to the field using `VariantType`
174/// .with_extension_type(VariantType),
175/// ]);
176/// ```
177///
178/// # Example: Converting a [`VariantArray`] to a [`StructArray`]
179///
180/// ```
181/// # use arrow::array::StructArray;
182/// # use parquet_variant::Variant;
183/// # use parquet_variant_compute::VariantArrayBuilder;
184/// // Create Variant Array
185/// let mut builder = VariantArrayBuilder::new(10);
186/// builder.append_variant(Variant::from("such wow"));
187/// let variant_array = builder.build();
188/// // convert to StructArray
189/// let struct_array: StructArray = variant_array.into();
190/// ```
191///
192/// # Example: Converting a [`StructArray`] to a [`VariantArray`]
193///
194/// ```
195/// # use arrow::array::StructArray;
196/// # use parquet_variant::Variant;
197/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray};
198/// # fn get_struct_array() -> StructArray {
199/// # let mut builder = VariantArrayBuilder::new(10);
200/// # builder.append_variant(Variant::from("such wow"));
201/// # builder.build().into()
202/// # }
203/// let struct_array: StructArray = get_struct_array();
204/// // try and create a VariantArray from it
205/// let variant_array = VariantArray::try_new(&struct_array).unwrap();
206/// assert_eq!(variant_array.value(0), Variant::from("such wow"));
207/// ```
208///
209#[derive(Clone, Debug)]
210pub struct VariantArray {
211 /// Reference to the underlying StructArray
212 inner: StructArray,
213
214 /// The metadata column of this variant
215 metadata: BinaryViewArray,
216
217 /// how is this variant array shredded?
218 shredding_state: ShreddingState,
219}
220
221impl VariantArray {
222 /// Creates a new `VariantArray` from a [`StructArray`].
223 ///
224 /// # Arguments
225 /// - `inner` - The underlying [`StructArray`] that contains the variant data.
226 ///
227 /// # Returns
228 /// - A new instance of `VariantArray`.
229 ///
230 /// # Errors:
231 /// - If the `StructArray` does not contain the required fields
232 ///
233 /// # Requirements of the `StructArray`
234 ///
235 /// 1. A required field named `metadata` which is binary, large_binary, or
236 /// binary_view
237 ///
238 /// 2. An optional field named `value` that is binary, large_binary, or
239 /// binary_view
240 ///
241 /// 3. An optional field named `typed_value` which can be any primitive type
242 /// or be a list, large_list, list_view or struct
243 ///
244 /// NOTE: It is also permissible for the metadata field to be
245 /// Dictionary-Encoded, preferably (but not required) with an index type of
246 /// int8.
247 ///
248 /// Currently, only [`BinaryViewArray`] are supported.
249 pub fn try_new(inner: &dyn Array) -> Result<Self, ArrowError> {
250 // Workaround lack of support for Binary
251 // https://github.com/apache/arrow-rs/issues/8387
252 let inner = cast_to_binary_view_arrays(inner)?;
253
254 let Some(inner) = inner.as_struct_opt() else {
255 return Err(ArrowError::InvalidArgumentError(
256 "Invalid VariantArray: requires StructArray as input".to_string(),
257 ));
258 };
259
260 // Note the specification allows for any order so we must search by name
261
262 // Ensure the StructArray has a metadata field of BinaryView
263 let Some(metadata_field) = inner.column_by_name("metadata") else {
264 return Err(ArrowError::InvalidArgumentError(
265 "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
266 ));
267 };
268 let Some(metadata) = metadata_field.as_binary_view_opt() else {
269 return Err(ArrowError::NotYetImplemented(format!(
270 "VariantArray 'metadata' field must be BinaryView, got {}",
271 metadata_field.data_type()
272 )));
273 };
274
275 // Note these clones are cheap, they just bump the ref count
276 Ok(Self {
277 inner: inner.clone(),
278 metadata: metadata.clone(),
279 shredding_state: ShreddingState::try_from(inner)?,
280 })
281 }
282
283 pub(crate) fn from_parts(
284 metadata: BinaryViewArray,
285 value: Option<BinaryViewArray>,
286 typed_value: Option<ArrayRef>,
287 nulls: Option<NullBuffer>,
288 ) -> Self {
289 let mut builder =
290 StructArrayBuilder::new().with_field("metadata", Arc::new(metadata.clone()), false);
291 if let Some(value) = value.clone() {
292 builder = builder.with_field("value", Arc::new(value), true);
293 }
294 if let Some(typed_value) = typed_value.clone() {
295 builder = builder.with_field("typed_value", typed_value, true);
296 }
297 if let Some(nulls) = nulls {
298 builder = builder.with_nulls(nulls);
299 }
300
301 Self {
302 inner: builder.build(),
303 metadata,
304 shredding_state: ShreddingState::new(value, typed_value),
305 }
306 }
307
308 /// Returns a reference to the underlying [`StructArray`].
309 pub fn inner(&self) -> &StructArray {
310 &self.inner
311 }
312
313 /// Returns the inner [`StructArray`], consuming self
314 pub fn into_inner(self) -> StructArray {
315 self.inner
316 }
317
318 /// Return the shredding state of this `VariantArray`
319 pub fn shredding_state(&self) -> &ShreddingState {
320 &self.shredding_state
321 }
322
323 /// Return the [`Variant`] instance stored at the given row
324 ///
325 /// Note: This method does not check for nulls and the value is arbitrary
326 /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
327 ///
328 /// # Panics
329 /// * if the index is out of bounds
330 /// * if the array value is null
331 ///
332 /// If this is a shredded variant but has no value at the shredded location, it
333 /// will return [`Variant::Null`].
334 ///
335 ///
336 /// # Performance Note
337 ///
338 /// This is certainly not the most efficient way to access values in a
339 /// `VariantArray`, but it is useful for testing and debugging.
340 ///
341 /// Note: Does not do deep validation of the [`Variant`], so it is up to the
342 /// caller to ensure that the metadata and value were constructed correctly.
343 pub fn value(&self, index: usize) -> Variant<'_, '_> {
344 match (self.typed_value_field(), self.value_field()) {
345 // Always prefer typed_value, if available
346 (Some(typed_value), value) if typed_value.is_valid(index) => {
347 typed_value_to_variant(typed_value, value, index)
348 }
349 // Otherwise fall back to value, if available
350 (_, Some(value)) if value.is_valid(index) => {
351 Variant::new(self.metadata.value(index), value.value(index))
352 }
353 // It is technically invalid for neither value nor typed_value fields to be available,
354 // but the spec specifically requires readers to return Variant::Null in this case.
355 _ => Variant::Null,
356 }
357 }
358
359 /// Return a reference to the metadata field of the [`StructArray`]
360 pub fn metadata_field(&self) -> &BinaryViewArray {
361 &self.metadata
362 }
363
364 /// Return a reference to the value field of the `StructArray`
365 pub fn value_field(&self) -> Option<&BinaryViewArray> {
366 self.shredding_state.value_field()
367 }
368
369 /// Return a reference to the typed_value field of the `StructArray`, if present
370 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
371 self.shredding_state.typed_value_field()
372 }
373
374 /// Return a field to represent this VariantArray in a `Schema` with
375 /// a particular name
376 pub fn field(&self, name: impl Into<String>) -> Field {
377 Field::new(
378 name.into(),
379 self.data_type().clone(),
380 self.inner.is_nullable(),
381 )
382 .with_extension_type(VariantType)
383 }
384
385 /// Returns a new DataType representing this VariantArray's inner type
386 pub fn data_type(&self) -> &DataType {
387 self.inner.data_type()
388 }
389
390 pub fn slice(&self, offset: usize, length: usize) -> Self {
391 let inner = self.inner.slice(offset, length);
392 let metadata = self.metadata.slice(offset, length);
393 let shredding_state = self.shredding_state.slice(offset, length);
394 Self {
395 inner,
396 metadata,
397 shredding_state,
398 }
399 }
400
401 pub fn len(&self) -> usize {
402 self.inner.len()
403 }
404
405 pub fn is_empty(&self) -> bool {
406 self.inner.is_empty()
407 }
408
409 pub fn nulls(&self) -> Option<&NullBuffer> {
410 self.inner.nulls()
411 }
412
413 /// Is the element at index null?
414 pub fn is_null(&self, index: usize) -> bool {
415 self.nulls().is_some_and(|n| n.is_null(index))
416 }
417
418 /// Is the element at index valid (not null)?
419 pub fn is_valid(&self, index: usize) -> bool {
420 !self.is_null(index)
421 }
422}
423
424impl From<VariantArray> for StructArray {
425 fn from(variant_array: VariantArray) -> Self {
426 variant_array.into_inner()
427 }
428}
429
430impl From<VariantArray> for ArrayRef {
431 fn from(variant_array: VariantArray) -> Self {
432 Arc::new(variant_array.into_inner())
433 }
434}
435
436/// One shredded field of a partially or prefectly shredded variant. For example, suppose the
437/// shredding schema for variant `v` treats it as an object with a single field `a`, where `a` is
438/// itself a struct with the single field `b` of type INT. Then the physical layout of the column
439/// is:
440///
441/// ```text
442/// v: VARIANT {
443/// metadata: BINARY,
444/// value: BINARY,
445/// typed_value: STRUCT {
446/// a: SHREDDED_VARIANT_FIELD {
447/// value: BINARY,
448/// typed_value: STRUCT {
449/// a: SHREDDED_VARIANT_FIELD {
450/// value: BINARY,
451/// typed_value: INT,
452/// },
453/// },
454/// },
455/// },
456/// }
457/// ```
458///
459/// In the above, each row of `v.value` is either a variant value (shredding failed, `v` was not an
460/// object at all) or a variant object (partial shredding, `v` was an object but included unexpected
461/// fields other than `a`), or is NULL (perfect shredding, `v` was an object containing only the
462/// single expected field `a`).
463///
464/// A similar story unfolds for each `v.typed_value.a.value` -- a variant value if shredding failed
465/// (`v:a` was not an object at all), or a variant object (`v:a` was an object with unexpected
466/// additional fields), or NULL (`v:a` was an object containing only the single expected field `b`).
467///
468/// Finally, `v.typed_value.a.typed_value.b.value` is either NULL (`v:a.b` was an integer) or else a
469/// variant value (which could be `Variant::Null`).
470#[derive(Debug)]
471pub struct ShreddedVariantFieldArray {
472 /// Reference to the underlying StructArray
473 inner: StructArray,
474 shredding_state: ShreddingState,
475}
476
477#[allow(unused)]
478impl ShreddedVariantFieldArray {
479 /// Creates a new `ShreddedVariantFieldArray` from a [`StructArray`].
480 ///
481 /// # Arguments
482 /// - `inner` - The underlying [`StructArray`] that contains the variant data.
483 ///
484 /// # Returns
485 /// - A new instance of `ShreddedVariantFieldArray`.
486 ///
487 /// # Errors:
488 /// - If the `StructArray` does not contain the required fields
489 ///
490 /// # Requirements of the `StructArray`
491 ///
492 /// 1. An optional field named `value` that is binary, large_binary, or
493 /// binary_view
494 ///
495 /// 2. An optional field named `typed_value` which can be any primitive type
496 /// or be a list, large_list, list_view or struct
497 ///
498 /// Currently, only `value` columns of type [`BinaryViewArray`] are supported.
499 pub fn try_new(inner: &dyn Array) -> Result<Self, ArrowError> {
500 let Some(inner_struct) = inner.as_struct_opt() else {
501 return Err(ArrowError::InvalidArgumentError(
502 "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(),
503 ));
504 };
505
506 // Note this clone is cheap, it just bumps the ref count
507 Ok(Self {
508 inner: inner_struct.clone(),
509 shredding_state: ShreddingState::try_from(inner_struct)?,
510 })
511 }
512
513 /// Return the shredding state of this `VariantArray`
514 pub fn shredding_state(&self) -> &ShreddingState {
515 &self.shredding_state
516 }
517
518 /// Return a reference to the value field of the `StructArray`
519 pub fn value_field(&self) -> Option<&BinaryViewArray> {
520 self.shredding_state.value_field()
521 }
522
523 /// Return a reference to the typed_value field of the `StructArray`, if present
524 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
525 self.shredding_state.typed_value_field()
526 }
527
528 /// Returns a reference to the underlying [`StructArray`].
529 pub fn inner(&self) -> &StructArray {
530 &self.inner
531 }
532
533 pub(crate) fn from_parts(
534 value: Option<BinaryViewArray>,
535 typed_value: Option<ArrayRef>,
536 nulls: Option<NullBuffer>,
537 ) -> Self {
538 let mut builder = StructArrayBuilder::new();
539 if let Some(value) = value.clone() {
540 builder = builder.with_field("value", Arc::new(value), true);
541 }
542 if let Some(typed_value) = typed_value.clone() {
543 builder = builder.with_field("typed_value", typed_value, true);
544 }
545 if let Some(nulls) = nulls {
546 builder = builder.with_nulls(nulls);
547 }
548
549 Self {
550 inner: builder.build(),
551 shredding_state: ShreddingState::new(value, typed_value),
552 }
553 }
554
555 /// Returns the inner [`StructArray`], consuming self
556 pub fn into_inner(self) -> StructArray {
557 self.inner
558 }
559
560 pub fn data_type(&self) -> &DataType {
561 self.inner.data_type()
562 }
563
564 pub fn len(&self) -> usize {
565 self.inner.len()
566 }
567
568 pub fn is_empty(&self) -> bool {
569 self.inner.is_empty()
570 }
571
572 pub fn offset(&self) -> usize {
573 self.inner.offset()
574 }
575
576 pub fn nulls(&self) -> Option<&NullBuffer> {
577 // According to the shredding spec, ShreddedVariantFieldArray should be
578 // physically non-nullable - SQL NULL is inferred by both value and
579 // typed_value being physically NULL
580 None
581 }
582 /// Is the element at index null?
583 pub fn is_null(&self, index: usize) -> bool {
584 self.nulls().is_some_and(|n| n.is_null(index))
585 }
586
587 /// Is the element at index valid (not null)?
588 pub fn is_valid(&self, index: usize) -> bool {
589 !self.is_null(index)
590 }
591}
592
593impl From<ShreddedVariantFieldArray> for ArrayRef {
594 fn from(array: ShreddedVariantFieldArray) -> Self {
595 Arc::new(array.into_inner())
596 }
597}
598
599impl From<ShreddedVariantFieldArray> for StructArray {
600 fn from(array: ShreddedVariantFieldArray) -> Self {
601 array.into_inner()
602 }
603}
604
605/// Represents the shredding state of a [`VariantArray`]
606///
607/// [`VariantArray`]s can be shredded according to the [Parquet Variant
608/// Shredding Spec]. Shredding means that the actual value is stored in a typed
609/// `typed_field` instead of the generic `value` field.
610///
611/// Both value and typed_value are optional fields used together to encode a
612/// single value. Values in the two fields must be interpreted according to the
613/// following table (see [Parquet Variant Shredding Spec] for more details):
614///
615/// | value | typed_value | Meaning |
616/// |----------|--------------|---------|
617/// | NULL | NULL | The value is missing; only valid for shredded object fields |
618/// | non-NULL | NULL | The value is present and may be any type, including [`Variant::Null`] |
619/// | NULL | non-NULL | The value is present and is the shredded type |
620/// | non-NULL | non-NULL | The value is present and is a partially shredded object |
621///
622///
623/// Applying the above rules to entire columns, we obtain the following:
624///
625/// | value | typed_value | Meaning |
626/// |--------|-------------|---------|
627/// | -- | -- | **Missing**: The value is always missing; only valid for shredded object fields |
628/// | exists | -- | **Unshredded**: If present, the value may be any type, including [`Variant::Null`]
629/// | -- | exists | **Perfectly shredded**: If present, the value is always the shredded type |
630/// | exists | exists | **Imperfectly shredded**: The value might (not) be present and might (not) be the shredded type |
631///
632/// NOTE: Partial shredding is a row-wise situation that can arise under imperfect shredding (a
633/// column-wise situation): When both columns exist (imperfect shredding) and the typed_value column
634/// is a struct, then both columns can be non-NULL for the same row if value is a variant object
635/// (partial shredding).
636///
637/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
638#[derive(Clone, Debug)]
639pub struct ShreddingState {
640 value: Option<BinaryViewArray>,
641 typed_value: Option<ArrayRef>,
642}
643
644impl ShreddingState {
645 /// Create a new `ShreddingState` from the given `value` and `typed_value` fields
646 ///
647 /// Note you can create a `ShreddingState` from a &[`StructArray`] using
648 /// `ShreddingState::try_from(&struct_array)`, for example:
649 ///
650 /// ```no_run
651 /// # use arrow::array::StructArray;
652 /// # use parquet_variant_compute::ShreddingState;
653 /// # fn get_struct_array() -> StructArray {
654 /// # unimplemented!()
655 /// # }
656 /// let struct_array: StructArray = get_struct_array();
657 /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
658 /// ```
659 pub fn new(value: Option<BinaryViewArray>, typed_value: Option<ArrayRef>) -> Self {
660 Self { value, typed_value }
661 }
662
663 /// Return a reference to the value field, if present
664 pub fn value_field(&self) -> Option<&BinaryViewArray> {
665 self.value.as_ref()
666 }
667
668 /// Return a reference to the typed_value field, if present
669 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
670 self.typed_value.as_ref()
671 }
672
673 /// Returns a borrowed version of this shredding state
674 pub fn borrow(&self) -> BorrowedShreddingState<'_> {
675 BorrowedShreddingState {
676 value: self.value_field(),
677 typed_value: self.typed_value_field(),
678 }
679 }
680
681 /// Slice all the underlying arrays
682 pub fn slice(&self, offset: usize, length: usize) -> Self {
683 Self {
684 value: self.value.as_ref().map(|v| v.slice(offset, length)),
685 typed_value: self.typed_value.as_ref().map(|tv| tv.slice(offset, length)),
686 }
687 }
688}
689
690/// Similar to [`ShreddingState`] except it holds borrowed references of the target arrays. Useful
691/// for avoiding clone operations when the caller does not need a self-standing shredding state.
692#[derive(Clone, Debug)]
693pub struct BorrowedShreddingState<'a> {
694 value: Option<&'a BinaryViewArray>,
695 typed_value: Option<&'a ArrayRef>,
696}
697
698impl<'a> BorrowedShreddingState<'a> {
699 /// Create a new `BorrowedShreddingState` from the given `value` and `typed_value` fields
700 ///
701 /// Note you can create a `BorrowedShreddingState` from a &[`StructArray`] using
702 /// `BorrowedShreddingState::try_from(&struct_array)`, for example:
703 ///
704 /// ```no_run
705 /// # use arrow::array::StructArray;
706 /// # use parquet_variant_compute::BorrowedShreddingState;
707 /// # fn get_struct_array() -> StructArray {
708 /// # unimplemented!()
709 /// # }
710 /// let struct_array: StructArray = get_struct_array();
711 /// let shredding_state = BorrowedShreddingState::try_from(&struct_array).unwrap();
712 /// ```
713 pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a ArrayRef>) -> Self {
714 Self { value, typed_value }
715 }
716
717 /// Return a reference to the value field, if present
718 pub fn value_field(&self) -> Option<&'a BinaryViewArray> {
719 self.value
720 }
721
722 /// Return a reference to the typed_value field, if present
723 pub fn typed_value_field(&self) -> Option<&'a ArrayRef> {
724 self.typed_value
725 }
726}
727
728impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
729 type Error = ArrowError;
730
731 fn try_from(inner_struct: &'a StructArray) -> Result<Self, ArrowError> {
732 // The `value` column need not exist, but if it does it must be a binary view.
733 let value = if let Some(value_col) = inner_struct.column_by_name("value") {
734 let Some(binary_view) = value_col.as_binary_view_opt() else {
735 return Err(ArrowError::NotYetImplemented(format!(
736 "VariantArray 'value' field must be BinaryView, got {}",
737 value_col.data_type()
738 )));
739 };
740 Some(binary_view)
741 } else {
742 None
743 };
744 let typed_value = inner_struct.column_by_name("typed_value");
745 Ok(BorrowedShreddingState::new(value, typed_value))
746 }
747}
748
749impl TryFrom<&StructArray> for ShreddingState {
750 type Error = ArrowError;
751
752 fn try_from(inner_struct: &StructArray) -> Result<Self, ArrowError> {
753 Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
754 }
755}
756
757impl From<BorrowedShreddingState<'_>> for ShreddingState {
758 fn from(state: BorrowedShreddingState<'_>) -> Self {
759 ShreddingState {
760 value: state.value_field().cloned(),
761 typed_value: state.typed_value_field().cloned(),
762 }
763 }
764}
765
766/// Builds struct arrays from component fields
767///
768/// TODO: move to arrow crate
769#[derive(Debug, Default, Clone)]
770pub(crate) struct StructArrayBuilder {
771 fields: Vec<FieldRef>,
772 arrays: Vec<ArrayRef>,
773 nulls: Option<NullBuffer>,
774}
775
776impl StructArrayBuilder {
777 pub fn new() -> Self {
778 Default::default()
779 }
780
781 /// Add an array to this struct array as a field with the specified name.
782 pub fn with_field(mut self, field_name: &str, array: ArrayRef, nullable: bool) -> Self {
783 let field = Field::new(field_name, array.data_type().clone(), nullable);
784 self.fields.push(Arc::new(field));
785 self.arrays.push(array);
786 self
787 }
788
789 /// Set the null buffer for this struct array.
790 pub fn with_nulls(mut self, nulls: NullBuffer) -> Self {
791 self.nulls = Some(nulls);
792 self
793 }
794
795 pub fn build(self) -> StructArray {
796 let Self {
797 fields,
798 arrays,
799 nulls,
800 } = self;
801 StructArray::new(Fields::from(fields), arrays, nulls)
802 }
803}
804
805/// returns the non-null element at index as a Variant
806fn typed_value_to_variant<'a>(
807 typed_value: &'a ArrayRef,
808 value: Option<&BinaryViewArray>,
809 index: usize,
810) -> Variant<'a, 'a> {
811 let data_type = typed_value.data_type();
812 if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) {
813 // Only a partially shredded struct is allowed to have values for both columns
814 panic!("Invalid variant, conflicting value and typed_value");
815 }
816 match data_type {
817 DataType::Boolean => {
818 let boolean_array = typed_value.as_boolean();
819 let value = boolean_array.value(index);
820 Variant::from(value)
821 }
822 DataType::Date32 => {
823 let array = typed_value.as_primitive::<Date32Type>();
824 let value = array.value(index);
825 let date = Date32Type::to_naive_date(value);
826 Variant::from(date)
827 }
828 // 16-byte FixedSizeBinary alway corresponds to a UUID; all other sizes are illegal.
829 DataType::FixedSizeBinary(16) => {
830 let array = typed_value.as_fixed_size_binary();
831 let value = array.value(index);
832 Uuid::from_slice(value).unwrap().into() // unwrap is safe: slice is always 16 bytes
833 }
834 DataType::BinaryView => {
835 let array = typed_value.as_binary_view();
836 let value = array.value(index);
837 Variant::from(value)
838 }
839 DataType::Utf8 => {
840 let array = typed_value.as_string::<i32>();
841 let value = array.value(index);
842 Variant::from(value)
843 }
844 DataType::Int8 => {
845 primitive_conversion_single_value!(Int8Type, typed_value, index)
846 }
847 DataType::Int16 => {
848 primitive_conversion_single_value!(Int16Type, typed_value, index)
849 }
850 DataType::Int32 => {
851 primitive_conversion_single_value!(Int32Type, typed_value, index)
852 }
853 DataType::Int64 => {
854 primitive_conversion_single_value!(Int64Type, typed_value, index)
855 }
856 DataType::Float16 => {
857 primitive_conversion_single_value!(Float16Type, typed_value, index)
858 }
859 DataType::Float32 => {
860 primitive_conversion_single_value!(Float32Type, typed_value, index)
861 }
862 DataType::Float64 => {
863 primitive_conversion_single_value!(Float64Type, typed_value, index)
864 }
865 DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => {
866 generic_conversion_single_value!(
867 TimestampMicrosecondType,
868 as_primitive,
869 |v| DateTime::from_timestamp_micros(v).unwrap(),
870 typed_value,
871 index
872 )
873 }
874 DataType::Timestamp(TimeUnit::Microsecond, None) => {
875 generic_conversion_single_value!(
876 TimestampMicrosecondType,
877 as_primitive,
878 |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(),
879 typed_value,
880 index
881 )
882 }
883 DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => {
884 generic_conversion_single_value!(
885 TimestampNanosecondType,
886 as_primitive,
887 DateTime::from_timestamp_nanos,
888 typed_value,
889 index
890 )
891 }
892 DataType::Timestamp(TimeUnit::Nanosecond, None) => {
893 generic_conversion_single_value!(
894 TimestampNanosecondType,
895 as_primitive,
896 |v| DateTime::from_timestamp_nanos(v).naive_utc(),
897 typed_value,
898 index
899 )
900 }
901 // todo other types here (note this is very similar to cast_to_variant.rs)
902 // so it would be great to figure out how to share this code
903 _ => {
904 // We shouldn't panic in production code, but this is a
905 // placeholder until we implement more types
906 // https://github.com/apache/arrow-rs/issues/8091
907 debug_assert!(
908 false,
909 "Unsupported typed_value type: {}",
910 typed_value.data_type()
911 );
912 Variant::Null
913 }
914 }
915}
916
917/// Workaround for lack of direct support for BinaryArray
918/// <https://github.com/apache/arrow-rs/issues/8387>
919///
920/// The values are read as
921/// * `StructArray<metadata: Binary, value: Binary>`
922///
923/// but VariantArray needs them as
924/// * `StructArray<metadata: BinaryView, value: BinaryView>`
925///
926/// So cast them to get the right type.
927fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
928 let new_type = canonicalize_and_verify_data_type(array.data_type())?;
929 cast(array, new_type.as_ref())
930}
931
/// Checks whether an arrow decimal with precision `p` and scale `s` is a valid variant decimal,
/// given the largest precision (`max_precision`) its decimal width may carry.
///
/// The precision must lie in `1..=max_precision` and the scale in `0..=p`.
///
/// NOTE: By a strict reading of the "decimal table" in the [shredding spec], each decimal type
/// should have a width-dependent lower bound on precision as well as an upper bound (i.e. Decimal16
/// with precision 5 is invalid because Decimal4 "covers" it). But the variant shredding integration
/// tests specifically expect such cases to succeed, so we only enforce the upper bound here.
///
/// [shredding spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool {
    let (precision, scale) = (*p, *s);
    if !(1..=max_precision).contains(&precision) {
        return false;
    }
    // The scale is signed, so the precision has to be expressed as an `i8` to bound it. A
    // precision above `i8::MAX` is rejected.
    i8::try_from(precision).is_ok_and(|max_scale| (0..=max_scale).contains(&scale))
}
943
944/// Recursively visits a data type, ensuring that it only contains data types that can legally
945/// appear in a (possibly shredded) variant array. It also replaces Binary fields with BinaryView,
946/// since that's what comes back from the parquet reader and what the variant code expects to find.
947fn canonicalize_and_verify_data_type(
948 data_type: &DataType,
949) -> Result<Cow<'_, DataType>, ArrowError> {
950 use DataType::*;
951
952 // helper macros
953 macro_rules! fail {
954 () => {
955 return Err(ArrowError::InvalidArgumentError(format!(
956 "Illegal shredded value type: {data_type}"
957 )))
958 };
959 }
960 macro_rules! borrow {
961 () => {
962 Cow::Borrowed(data_type)
963 };
964 }
965
966 let new_data_type = match data_type {
967 // Primitive arrow types that have a direct variant counterpart are allowed
968 Null | Boolean => borrow!(),
969 Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(),
970
971 // Unsigned integers and half-float are not allowed
972 UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(),
973
974 // Most decimal types are allowed, with restrictions on precision and scale
975 Decimal32(p, s) if is_valid_variant_decimal(p, s, 9) => borrow!(),
976 Decimal64(p, s) if is_valid_variant_decimal(p, s, 18) => borrow!(),
977 Decimal128(p, s) if is_valid_variant_decimal(p, s, 38) => borrow!(),
978 Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(),
979
980 // Only micro and nano timestamps are allowed
981 Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(),
982 Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(),
983
984 // Only 32-bit dates and 64-bit microsecond time are allowed.
985 Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
986 Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
987
988 // Binary and string are allowed. Force Binary to BinaryView because that's what the parquet
989 // reader returns and what the rest of the variant code expects.
990 Binary => Cow::Owned(DataType::BinaryView),
991 BinaryView | Utf8 => borrow!(),
992
993 // UUID maps to 16-byte fixed-size binary; no other width is allowed
994 FixedSizeBinary(16) => borrow!(),
995 FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
996
997 // We can _possibly_ allow (some of) these some day?
998 LargeBinary | LargeUtf8 | Utf8View | ListView(_) | LargeList(_) | LargeListView(_) => {
999 fail!()
1000 }
1001
1002 // Lists and struct are allowed, maps and unions are not
1003 List(field) => match canonicalize_and_verify_field(field)? {
1004 Cow::Borrowed(_) => borrow!(),
1005 Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)),
1006 },
1007 // Struct is used by the internal layout, and can also represent a shredded variant object.
1008 Struct(fields) => {
1009 // Avoid allocation unless at least one field changes, to avoid unnecessary deep cloning
1010 // of the data type. Even if some fields change, the others are shallow arc clones.
1011 let mut new_fields = std::collections::HashMap::new();
1012 for (i, field) in fields.iter().enumerate() {
1013 if let Cow::Owned(new_field) = canonicalize_and_verify_field(field)? {
1014 new_fields.insert(i, new_field);
1015 }
1016 }
1017
1018 if new_fields.is_empty() {
1019 borrow!()
1020 } else {
1021 let new_fields = fields
1022 .iter()
1023 .enumerate()
1024 .map(|(i, field)| new_fields.remove(&i).unwrap_or_else(|| field.clone()));
1025 Cow::Owned(DataType::Struct(new_fields.collect()))
1026 }
1027 }
1028 Map(..) | Union(..) => fail!(),
1029
1030 // We can _possibly_ support (some of) these some day?
1031 Dictionary(..) | RunEndEncoded(..) => fail!(),
1032 };
1033 Ok(new_data_type)
1034}
1035
1036fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, Arc<Field>>, ArrowError> {
1037 let Cow::Owned(new_data_type) = canonicalize_and_verify_data_type(field.data_type())? else {
1038 return Ok(Cow::Borrowed(field));
1039 };
1040 let new_field = field.as_ref().clone().with_data_type(new_data_type);
1041 Ok(Cow::Owned(Arc::new(new_field)))
1042}
1043
#[cfg(test)]
mod test {
    use super::*;
    use arrow::array::{BinaryViewArray, Int32Array};
    use arrow_schema::{Field, Fields};

    /// One-row `BinaryView` column, used wherever a test needs a binary view column.
    fn make_binary_view_array() -> ArrayRef {
        Arc::new(BinaryViewArray::from(vec![&b"test"[..]]))
    }

    /// One-row `Int32` column, used as a column of the wrong type.
    fn make_int32_array() -> ArrayRef {
        let values: Vec<i32> = vec![1];
        Arc::new(Int32Array::from(values))
    }

    #[test]
    fn invalid_not_a_struct_array() {
        // A bare binary view column is not a StructArray, so construction must be refused
        let not_a_struct = make_binary_view_array();
        let message = VariantArray::try_new(&not_a_struct)
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            "Invalid argument error: Invalid VariantArray: requires StructArray as input"
        );
    }

    #[test]
    fn invalid_missing_metadata() {
        // The struct only has a 'value' column; the 'metadata' column is missing
        let schema = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]);
        let columns = vec![make_binary_view_array()];
        let only_value = StructArray::new(schema, columns, None);

        let message = VariantArray::try_new(&only_value).unwrap_err().to_string();
        assert_eq!(
            message,
            "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
        );
    }

    #[test]
    fn all_null_missing_value_and_typed_value() {
        // The struct has a 'metadata' column but neither 'value' nor 'typed_value'
        let schema = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
        let columns = vec![make_binary_view_array()];
        let only_metadata = StructArray::new(schema, columns, None);

        // NOTE: By strict spec interpretation, this case (top-level variant with null/null)
        // should be invalid, but we currently allow it and treat it as Variant::Null.
        // This is a pragmatic decision to handle missing data gracefully.
        let variant_array = VariantArray::try_new(&only_metadata).unwrap();

        // The shredding state must report both columns as absent
        let state = variant_array.shredding_state();
        assert!(matches!(
            state,
            ShreddingState {
                value: None,
                typed_value: None
            }
        ));

        // Every valid row must read back as Variant::Null (compensating for spec violation)
        (0..variant_array.len())
            .filter(|&row| variant_array.is_valid(row))
            .for_each(|row| {
                assert_eq!(variant_array.value(row), parquet_variant::Variant::Null);
            });
    }

    #[test]
    fn invalid_metadata_field_type() {
        let schema = Fields::from(vec![
            Field::new("metadata", DataType::Int32, true), // not supported
            Field::new("value", DataType::BinaryView, true),
        ]);
        let columns = vec![make_int32_array(), make_binary_view_array()];
        let bad_metadata = StructArray::new(schema, columns, None);

        let message = VariantArray::try_new(&bad_metadata)
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Int32"
        );
    }

    #[test]
    fn invalid_value_field_type() {
        let schema = Fields::from(vec![
            Field::new("metadata", DataType::BinaryView, true),
            Field::new("value", DataType::Int32, true), // Not yet supported
        ]);
        let columns = vec![make_binary_view_array(), make_int32_array()];
        let bad_value = StructArray::new(schema, columns, None);

        let message = VariantArray::try_new(&bad_value).unwrap_err().to_string();
        assert_eq!(
            message,
            "Not yet implemented: VariantArray 'value' field must be BinaryView, got Int32"
        );
    }

    #[test]
    fn all_null_shredding_state() {
        // A state built without either column must report both as absent
        let state = ShreddingState::new(None, None);
        assert!(matches!(
            state,
            ShreddingState {
                value: None,
                typed_value: None
            }
        ));
    }

    #[test]
    fn all_null_variant_array_construction() {
        // Three rows of metadata, with the struct itself null in every row
        let metadata = BinaryViewArray::from(vec![&b"test"[..]; 3]);
        let every_row_null = NullBuffer::from(vec![false; 3]);

        let schema = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
        let struct_array =
            StructArray::new(schema, vec![Arc::new(metadata)], Some(every_row_null));

        let variant_array = VariantArray::try_new(&struct_array).unwrap();

        // The shredding state must report both columns as absent
        let state = variant_array.shredding_state();
        assert!(matches!(
            state,
            ShreddingState {
                value: None,
                typed_value: None
            }
        ));

        // All three rows must be null
        assert_eq!(variant_array.len(), 3);
        assert!((0..3).all(|row| !variant_array.is_valid(row)));

        // Same check over the reported length, naming the offending row on failure
        for i in 0..variant_array.len() {
            assert!(
                !variant_array.is_valid(i),
                "Expected value at index {i} to be null"
            );
        }
    }

    #[test]
    fn value_field_present_but_all_null_should_be_unshredded() {
        // When a value field exists in the schema but all of its values are null, the
        // shredding state must still report the value column as present.
        let metadata = BinaryViewArray::from(vec![&b"test"[..]; 3]);

        // Build the value column from three empty values, then overlay a null buffer that
        // marks every one of them as null
        let all_null = NullBuffer::from(vec![false; 3]);
        let value = BinaryViewArray::from(
            BinaryViewArray::from_iter_values(vec![""; 3])
                .to_data()
                .into_builder()
                .nulls(Some(all_null))
                .build()
                .unwrap(),
        );

        let schema = Fields::from(vec![
            Field::new("metadata", DataType::BinaryView, false),
            Field::new("value", DataType::BinaryView, true), // Field exists in schema
        ]);
        let columns: Vec<ArrayRef> = vec![Arc::new(metadata), Arc::new(value)];
        // The struct itself is not null, just the value field is all null
        let struct_array = StructArray::new(schema, columns, None);

        let variant_array = VariantArray::try_new(&struct_array).unwrap();

        // The value column is present and the typed_value column is absent
        let state = variant_array.shredding_state();
        assert!(matches!(
            state,
            ShreddingState {
                value: Some(_),
                typed_value: None
            }
        ));
    }
}