parquet_variant_compute/
variant_to_arrow.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use arrow::array::{
19    ArrayRef, BinaryViewArray, BooleanBuilder, NullArray, NullBufferBuilder, PrimitiveBuilder,
20};
21use arrow::compute::{CastOptions, DecimalCast};
22use arrow::datatypes::{self, DataType, DecimalType};
23use arrow::error::{ArrowError, Result};
24use parquet_variant::{Variant, VariantPath};
25
26use crate::type_conversion::{
27    PrimitiveFromVariant, TimestampFromVariant, variant_to_unscaled_decimal,
28};
29use crate::{VariantArray, VariantValueArrayBuilder};
30
31use arrow_schema::TimeUnit;
32use std::sync::Arc;
33
34/// Builder for converting variant values to primitive Arrow arrays. It is used by both
35/// `VariantToArrowRowBuilder` (below) and `VariantToShreddedPrimitiveVariantRowBuilder` (in
36/// `shred_variant.rs`).
37pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
38    Null(VariantToNullArrowRowBuilder<'a>),
39    Boolean(VariantToBooleanArrowRowBuilder<'a>),
40    Int8(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int8Type>),
41    Int16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int16Type>),
42    Int32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int32Type>),
43    Int64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int64Type>),
44    UInt8(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt8Type>),
45    UInt16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt16Type>),
46    UInt32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt32Type>),
47    UInt64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt64Type>),
48    Float16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float16Type>),
49    Float32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float32Type>),
50    Float64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float64Type>),
51    Decimal32(VariantToDecimalArrowRowBuilder<'a, datatypes::Decimal32Type>),
52    Decimal64(VariantToDecimalArrowRowBuilder<'a, datatypes::Decimal64Type>),
53    Decimal128(VariantToDecimalArrowRowBuilder<'a, datatypes::Decimal128Type>),
54    Decimal256(VariantToDecimalArrowRowBuilder<'a, datatypes::Decimal256Type>),
55    TimestampMicro(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>),
56    TimestampMicroNtz(
57        VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>,
58    ),
59    TimestampNano(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampNanosecondType>),
60    TimestampNanoNtz(VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampNanosecondType>),
61    Time(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Time64MicrosecondType>),
62    Date(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>),
63}
64
65/// Builder for converting variant values into strongly typed Arrow arrays.
66///
67/// Useful for variant_get kernels that need to extract specific paths from variant values, possibly
68/// with casting of leaf values to specific types.
69pub(crate) enum VariantToArrowRowBuilder<'a> {
70    Primitive(PrimitiveVariantToArrowRowBuilder<'a>),
71    BinaryVariant(VariantToBinaryVariantArrowRowBuilder),
72
73    // Path extraction wrapper - contains a boxed enum for any of the above
74    WithPath(VariantPathRowBuilder<'a>),
75}
76
77impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
78    pub fn append_null(&mut self) -> Result<()> {
79        use PrimitiveVariantToArrowRowBuilder::*;
80        match self {
81            Null(b) => b.append_null(),
82            Boolean(b) => b.append_null(),
83            Int8(b) => b.append_null(),
84            Int16(b) => b.append_null(),
85            Int32(b) => b.append_null(),
86            Int64(b) => b.append_null(),
87            UInt8(b) => b.append_null(),
88            UInt16(b) => b.append_null(),
89            UInt32(b) => b.append_null(),
90            UInt64(b) => b.append_null(),
91            Float16(b) => b.append_null(),
92            Float32(b) => b.append_null(),
93            Float64(b) => b.append_null(),
94            Decimal32(b) => b.append_null(),
95            Decimal64(b) => b.append_null(),
96            Decimal128(b) => b.append_null(),
97            Decimal256(b) => b.append_null(),
98            TimestampMicro(b) => b.append_null(),
99            TimestampMicroNtz(b) => b.append_null(),
100            TimestampNano(b) => b.append_null(),
101            TimestampNanoNtz(b) => b.append_null(),
102            Time(b) => b.append_null(),
103            Date(b) => b.append_null(),
104        }
105    }
106
107    pub fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
108        use PrimitiveVariantToArrowRowBuilder::*;
109        match self {
110            Null(b) => b.append_value(value),
111            Boolean(b) => b.append_value(value),
112            Int8(b) => b.append_value(value),
113            Int16(b) => b.append_value(value),
114            Int32(b) => b.append_value(value),
115            Int64(b) => b.append_value(value),
116            UInt8(b) => b.append_value(value),
117            UInt16(b) => b.append_value(value),
118            UInt32(b) => b.append_value(value),
119            UInt64(b) => b.append_value(value),
120            Float16(b) => b.append_value(value),
121            Float32(b) => b.append_value(value),
122            Float64(b) => b.append_value(value),
123            Decimal32(b) => b.append_value(value),
124            Decimal64(b) => b.append_value(value),
125            Decimal128(b) => b.append_value(value),
126            Decimal256(b) => b.append_value(value),
127            TimestampMicro(b) => b.append_value(value),
128            TimestampMicroNtz(b) => b.append_value(value),
129            TimestampNano(b) => b.append_value(value),
130            TimestampNanoNtz(b) => b.append_value(value),
131            Time(b) => b.append_value(value),
132            Date(b) => b.append_value(value),
133        }
134    }
135
136    pub fn finish(self) -> Result<ArrayRef> {
137        use PrimitiveVariantToArrowRowBuilder::*;
138        match self {
139            Null(b) => b.finish(),
140            Boolean(b) => b.finish(),
141            Int8(b) => b.finish(),
142            Int16(b) => b.finish(),
143            Int32(b) => b.finish(),
144            Int64(b) => b.finish(),
145            UInt8(b) => b.finish(),
146            UInt16(b) => b.finish(),
147            UInt32(b) => b.finish(),
148            UInt64(b) => b.finish(),
149            Float16(b) => b.finish(),
150            Float32(b) => b.finish(),
151            Float64(b) => b.finish(),
152            Decimal32(b) => b.finish(),
153            Decimal64(b) => b.finish(),
154            Decimal128(b) => b.finish(),
155            Decimal256(b) => b.finish(),
156            TimestampMicro(b) => b.finish(),
157            TimestampMicroNtz(b) => b.finish(),
158            TimestampNano(b) => b.finish(),
159            TimestampNanoNtz(b) => b.finish(),
160            Time(b) => b.finish(),
161            Date(b) => b.finish(),
162        }
163    }
164}
165
166impl<'a> VariantToArrowRowBuilder<'a> {
167    pub fn append_null(&mut self) -> Result<()> {
168        use VariantToArrowRowBuilder::*;
169        match self {
170            Primitive(b) => b.append_null(),
171            BinaryVariant(b) => b.append_null(),
172            WithPath(path_builder) => path_builder.append_null(),
173        }
174    }
175
176    pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
177        use VariantToArrowRowBuilder::*;
178        match self {
179            Primitive(b) => b.append_value(&value),
180            BinaryVariant(b) => b.append_value(value),
181            WithPath(path_builder) => path_builder.append_value(value),
182        }
183    }
184
185    pub fn finish(self) -> Result<ArrayRef> {
186        use VariantToArrowRowBuilder::*;
187        match self {
188            Primitive(b) => b.finish(),
189            BinaryVariant(b) => b.finish(),
190            WithPath(path_builder) => path_builder.finish(),
191        }
192    }
193}
194
195/// Creates a primitive row builder, returning Err if the requested data type is not primitive.
196pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
197    data_type: &'a DataType,
198    cast_options: &'a CastOptions,
199    capacity: usize,
200) -> Result<PrimitiveVariantToArrowRowBuilder<'a>> {
201    use PrimitiveVariantToArrowRowBuilder::*;
202
203    let builder =
204        match data_type {
205            DataType::Null => Null(VariantToNullArrowRowBuilder::new(cast_options, capacity)),
206            DataType::Boolean => {
207                Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity))
208            }
209            DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
210                cast_options,
211                capacity,
212            )),
213            DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
214                cast_options,
215                capacity,
216            )),
217            DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
218                cast_options,
219                capacity,
220            )),
221            DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
222                cast_options,
223                capacity,
224            )),
225            DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
226                cast_options,
227                capacity,
228            )),
229            DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
230                cast_options,
231                capacity,
232            )),
233            DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
234                cast_options,
235                capacity,
236            )),
237            DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
238                cast_options,
239                capacity,
240            )),
241            DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new(
242                cast_options,
243                capacity,
244            )),
245            DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new(
246                cast_options,
247                capacity,
248            )),
249            DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new(
250                cast_options,
251                capacity,
252            )),
253            DataType::Decimal32(precision, scale) => Decimal32(
254                VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?,
255            ),
256            DataType::Decimal64(precision, scale) => Decimal64(
257                VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?,
258            ),
259            DataType::Decimal128(precision, scale) => Decimal128(
260                VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?,
261            ),
262            DataType::Decimal256(precision, scale) => Decimal256(
263                VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?,
264            ),
265            DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicroNtz(
266                VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
267            ),
268            DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
269                VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()),
270            ),
271            DataType::Timestamp(TimeUnit::Nanosecond, None) => TimestampNanoNtz(
272                VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
273            ),
274            DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
275                VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()),
276            ),
277            DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new(
278                cast_options,
279                capacity,
280            )),
281            DataType::Time64(TimeUnit::Microsecond) => Time(
282                VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity),
283            ),
284            _ if data_type.is_primitive() => {
285                return Err(ArrowError::NotYetImplemented(format!(
286                    "Primitive data_type {data_type:?} not yet implemented"
287                )));
288            }
289            _ => {
290                return Err(ArrowError::InvalidArgumentError(format!(
291                    "Not a primitive type: {data_type:?}"
292                )));
293            }
294        };
295    Ok(builder)
296}
297
298pub(crate) fn make_variant_to_arrow_row_builder<'a>(
299    metadata: &BinaryViewArray,
300    path: VariantPath<'a>,
301    data_type: Option<&'a DataType>,
302    cast_options: &'a CastOptions,
303    capacity: usize,
304) -> Result<VariantToArrowRowBuilder<'a>> {
305    use VariantToArrowRowBuilder::*;
306
307    let mut builder = match data_type {
308        // If no data type was requested, build an unshredded VariantArray.
309        None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new(
310            metadata.clone(),
311            capacity,
312        )),
313        Some(DataType::Struct(_)) => {
314            return Err(ArrowError::NotYetImplemented(
315                "Converting unshredded variant objects to arrow structs".to_string(),
316            ));
317        }
318        Some(
319            DataType::List(_)
320            | DataType::LargeList(_)
321            | DataType::ListView(_)
322            | DataType::LargeListView(_)
323            | DataType::FixedSizeList(..),
324        ) => {
325            return Err(ArrowError::NotYetImplemented(
326                "Converting unshredded variant arrays to arrow lists".to_string(),
327            ));
328        }
329        Some(data_type) => {
330            let builder =
331                make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?;
332            Primitive(builder)
333        }
334    };
335
336    // Wrap with path extraction if needed
337    if !path.is_empty() {
338        builder = WithPath(VariantPathRowBuilder {
339            builder: Box::new(builder),
340            path,
341        })
342    };
343
344    Ok(builder)
345}
346
347/// A thin wrapper whose only job is to extract a specific path from a variant value and pass the
348/// result to a nested builder.
349pub(crate) struct VariantPathRowBuilder<'a> {
350    builder: Box<VariantToArrowRowBuilder<'a>>,
351    path: VariantPath<'a>,
352}
353
354impl<'a> VariantPathRowBuilder<'a> {
355    fn append_null(&mut self) -> Result<()> {
356        self.builder.append_null()
357    }
358
359    fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
360        if let Some(v) = value.get_path(&self.path) {
361            self.builder.append_value(v)
362        } else {
363            self.builder.append_null()?;
364            Ok(false)
365        }
366    }
367
368    fn finish(self) -> Result<ArrayRef> {
369        self.builder.finish()
370    }
371}
372
// Generates a row-builder struct that wraps an Arrow builder together with the cast options.
//
// Arguments, in order:
//   * `struct Name<'a [, T: Bound]>`: name, lifetime and optional bounded generic of the
//     generated struct.
//   * `|capacity [, field: Type]| -> Builder[<T>] { init }`: parameters of the generated `new`,
//     the type of the wrapped builder and the expression that constructs it.
//   * `|value| transform`: expression turning the `&Variant` bound to `value` into an `Option`;
//     `None` means the conversion failed.
//   * `type_name: expr`: value shown in the cast error message.
macro_rules! define_variant_to_primitive_builder {
    (struct $name:ident<$lifetime:lifetime $(, $generic:ident: $bound:path )?>
    |$array_param:ident $(, $field:ident: $field_type:ty)?| -> $builder_name:ident $(< $array_type:ty >)? { $init_expr: expr },
    |$value: ident| $value_transform:expr,
    type_name: $type_name:expr) => {
        pub(crate) struct $name<$lifetime $(, $generic: $bound )?> {
            builder: $builder_name $(<$array_type>)?,
            cast_options: &$lifetime CastOptions<$lifetime>,
        }

        impl<$lifetime $(, $generic: $bound )?> $name<$lifetime $(, $generic )?> {
            fn new(
                cast_options: &$lifetime CastOptions<$lifetime>,
                $array_param: usize,
                // The optional extra parameter exists so that `$init_expr` can refer to it
                $( $field: $field_type, )?
            ) -> Self {
                let builder = $init_expr;
                Self {
                    builder,
                    cast_options,
                }
            }

            fn append_null(&mut self) -> Result<()> {
                self.builder.append_null();
                Ok(())
            }

            // Returns Ok(true) if a converted value was appended and Ok(false) if a null was
            // appended in its place.
            fn append_value(&mut self, $value: &Variant<'_, '_>) -> Result<bool> {
                match $value_transform {
                    Some(converted) => {
                        self.builder.append_value(converted);
                        Ok(true)
                    }
                    // Safe casting: a failed conversion becomes a null row
                    None if self.cast_options.safe => {
                        self.builder.append_null();
                        Ok(false)
                    }
                    // Unsafe casting: a failed conversion is reported as an error
                    None => Err(ArrowError::CastError(format!(
                        "Failed to extract primitive of type {} from variant {:?} at path VariantPath([])",
                        $type_name,
                        $value
                    ))),
                }
            }

            // `mut` is required by builders whose `finish` takes `&mut self`, but is unused for
            // builders whose `finish` consumes `self` (mainly `FakeNullBuilder`), so the warning
            // is silenced here.
            #[allow(unused_mut)]
            fn finish(mut self) -> Result<ArrayRef> {
                Ok(Arc::new(self.builder.finish()))
            }
        }
    }
}
430
431define_variant_to_primitive_builder!(
432    struct VariantToBooleanArrowRowBuilder<'a>
433    |capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) },
434    |value|  value.as_boolean(),
435    type_name: datatypes::BooleanType::DATA_TYPE
436);
437
438define_variant_to_primitive_builder!(
439    struct VariantToPrimitiveArrowRowBuilder<'a, T:PrimitiveFromVariant>
440    |capacity| -> PrimitiveBuilder<T> { PrimitiveBuilder::<T>::with_capacity(capacity) },
441    |value| T::from_variant(value),
442    type_name: T::DATA_TYPE
443);
444
445define_variant_to_primitive_builder!(
446    struct VariantToTimestampNtzArrowRowBuilder<'a, T:TimestampFromVariant<true>>
447    |capacity| -> PrimitiveBuilder<T> { PrimitiveBuilder::<T>::with_capacity(capacity) },
448    |value| T::from_variant(value),
449    type_name: T::DATA_TYPE
450);
451
452define_variant_to_primitive_builder!(
453    struct VariantToTimestampArrowRowBuilder<'a, T:TimestampFromVariant<false>>
454    |capacity, tz: Option<Arc<str>> | -> PrimitiveBuilder<T> {
455        PrimitiveBuilder::<T>::with_capacity(capacity).with_timezone_opt(tz)
456    },
457    |value| T::from_variant(value),
458    type_name: T::DATA_TYPE
459);
460
461/// Builder for converting variant values to arrow Decimal values
462pub(crate) struct VariantToDecimalArrowRowBuilder<'a, T>
463where
464    T: DecimalType,
465    T::Native: DecimalCast,
466{
467    builder: PrimitiveBuilder<T>,
468    cast_options: &'a CastOptions<'a>,
469    precision: u8,
470    scale: i8,
471}
472
473impl<'a, T> VariantToDecimalArrowRowBuilder<'a, T>
474where
475    T: DecimalType,
476    T::Native: DecimalCast,
477{
478    fn new(
479        cast_options: &'a CastOptions<'a>,
480        capacity: usize,
481        precision: u8,
482        scale: i8,
483    ) -> Result<Self> {
484        let builder = PrimitiveBuilder::<T>::with_capacity(capacity)
485            .with_precision_and_scale(precision, scale)?;
486        Ok(Self {
487            builder,
488            cast_options,
489            precision,
490            scale,
491        })
492    }
493
494    fn append_null(&mut self) -> Result<()> {
495        self.builder.append_null();
496        Ok(())
497    }
498
499    fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
500        if let Some(scaled) = variant_to_unscaled_decimal::<T>(value, self.precision, self.scale) {
501            self.builder.append_value(scaled);
502            Ok(true)
503        } else if self.cast_options.safe {
504            self.builder.append_null();
505            Ok(false)
506        } else {
507            Err(ArrowError::CastError(format!(
508                "Failed to cast to {}(precision={}, scale={}) from variant {:?}",
509                T::PREFIX,
510                self.precision,
511                self.scale,
512                value
513            )))
514        }
515    }
516
517    fn finish(mut self) -> Result<ArrayRef> {
518        Ok(Arc::new(self.builder.finish()))
519    }
520}
521
522/// Builder for creating VariantArray output (for path extraction without type conversion)
523pub(crate) struct VariantToBinaryVariantArrowRowBuilder {
524    metadata: BinaryViewArray,
525    builder: VariantValueArrayBuilder,
526    nulls: NullBufferBuilder,
527}
528
529impl VariantToBinaryVariantArrowRowBuilder {
530    fn new(metadata: BinaryViewArray, capacity: usize) -> Self {
531        Self {
532            metadata,
533            builder: VariantValueArrayBuilder::new(capacity),
534            nulls: NullBufferBuilder::new(capacity),
535        }
536    }
537}
538
539impl VariantToBinaryVariantArrowRowBuilder {
540    fn append_null(&mut self) -> Result<()> {
541        self.builder.append_null();
542        self.nulls.append_null();
543        Ok(())
544    }
545
546    fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
547        self.builder.append_value(value);
548        self.nulls.append_non_null();
549        Ok(true)
550    }
551
552    fn finish(mut self) -> Result<ArrayRef> {
553        let variant_array = VariantArray::from_parts(
554            self.metadata,
555            Some(self.builder.build()?),
556            None, // no typed_value column
557            self.nulls.finish(),
558        );
559
560        Ok(ArrayRef::from(variant_array))
561    }
562}
563
564struct FakeNullBuilder(NullArray);
565
566impl FakeNullBuilder {
567    fn new(capacity: usize) -> Self {
568        Self(NullArray::new(capacity))
569    }
570    fn append_value<T>(&mut self, _: T) {}
571    fn append_null(&mut self) {}
572
573    fn finish(self) -> NullArray {
574        self.0
575    }
576}
577
578define_variant_to_primitive_builder!(
579    struct VariantToNullArrowRowBuilder<'a>
580    |capacity| -> FakeNullBuilder { FakeNullBuilder::new(capacity) },
581    |_value|  Some(Variant::Null),
582    type_name: "Null"
583);