Skip to main content

parquet_variant_compute/
variant_to_arrow.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
18use crate::shred_variant::{
19    VariantToShreddedVariantRowBuilder, make_variant_to_shredded_variant_arrow_row_builder,
20};
21use crate::type_conversion::{
22    PrimitiveFromVariant, TimestampFromVariant, variant_to_unscaled_decimal,
23};
24use crate::variant_array::ShreddedVariantFieldArray;
25use crate::{VariantArray, VariantValueArrayBuilder};
26use arrow::array::{
27    ArrayRef, ArrowNativeTypeOp, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewArray,
28    BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, GenericListArray,
29    GenericListViewArray, LargeBinaryBuilder, LargeStringBuilder, NullArray, NullBufferBuilder,
30    OffsetSizeTrait, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder,
31};
32use arrow::buffer::{OffsetBuffer, ScalarBuffer};
33use arrow::compute::{CastOptions, DecimalCast};
34use arrow::datatypes::{self, DataType, DecimalType};
35use arrow::error::{ArrowError, Result};
36use arrow_schema::{FieldRef, TimeUnit};
37use parquet_variant::{Variant, VariantPath};
38use std::sync::Arc;
39
40/// Builder for converting variant values into strongly typed Arrow arrays.
41///
42/// Useful for variant_get kernels that need to extract specific paths from variant values, possibly
43/// with casting of leaf values to specific types.
44pub(crate) enum VariantToArrowRowBuilder<'a> {
45    Primitive(PrimitiveVariantToArrowRowBuilder<'a>),
46    Array(ArrayVariantToArrowRowBuilder<'a>),
47    BinaryVariant(VariantToBinaryVariantArrowRowBuilder),
48
49    // Path extraction wrapper - contains a boxed enum for any of the above
50    WithPath(VariantPathRowBuilder<'a>),
51}
52
53impl<'a> VariantToArrowRowBuilder<'a> {
54    pub fn append_null(&mut self) -> Result<()> {
55        use VariantToArrowRowBuilder::*;
56        match self {
57            Primitive(b) => b.append_null(),
58            Array(b) => b.append_null(),
59            BinaryVariant(b) => b.append_null(),
60            WithPath(path_builder) => path_builder.append_null(),
61        }
62    }
63
64    pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
65        use VariantToArrowRowBuilder::*;
66        match self {
67            Primitive(b) => b.append_value(&value),
68            Array(b) => b.append_value(&value),
69            BinaryVariant(b) => b.append_value(value),
70            WithPath(path_builder) => path_builder.append_value(value),
71        }
72    }
73
74    pub fn finish(self) -> Result<ArrayRef> {
75        use VariantToArrowRowBuilder::*;
76        match self {
77            Primitive(b) => b.finish(),
78            Array(b) => b.finish(),
79            BinaryVariant(b) => b.finish(),
80            WithPath(path_builder) => path_builder.finish(),
81        }
82    }
83}
84
85pub(crate) fn make_variant_to_arrow_row_builder<'a>(
86    metadata: &BinaryViewArray,
87    path: VariantPath<'a>,
88    data_type: Option<&'a DataType>,
89    cast_options: &'a CastOptions,
90    capacity: usize,
91) -> Result<VariantToArrowRowBuilder<'a>> {
92    use VariantToArrowRowBuilder::*;
93
94    let mut builder = match data_type {
95        // If no data type was requested, build an unshredded VariantArray.
96        None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new(
97            metadata.clone(),
98            capacity,
99        )),
100        Some(DataType::Struct(_)) => {
101            return Err(ArrowError::NotYetImplemented(
102                "Converting unshredded variant objects to arrow structs".to_string(),
103            ));
104        }
105        Some(
106            data_type @ (DataType::List(_)
107            | DataType::LargeList(_)
108            | DataType::ListView(_)
109            | DataType::LargeListView(_)
110            | DataType::FixedSizeList(..)),
111        ) => {
112            let builder =
113                ArrayVariantToArrowRowBuilder::try_new(data_type, cast_options, capacity)?;
114            Array(builder)
115        }
116        Some(data_type) => {
117            let builder =
118                make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?;
119            Primitive(builder)
120        }
121    };
122
123    // Wrap with path extraction if needed
124    if !path.is_empty() {
125        builder = WithPath(VariantPathRowBuilder {
126            builder: Box::new(builder),
127            path,
128        })
129    };
130
131    Ok(builder)
132}
133
134/// Builder for converting primitive variant values to Arrow arrays. It is used by both
135/// `VariantToArrowRowBuilder` (below) and `VariantToShreddedPrimitiveVariantRowBuilder` (in
136/// `shred_variant.rs`).
137pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
138    Null(VariantToNullArrowRowBuilder<'a>),
139    Boolean(VariantToBooleanArrowRowBuilder<'a>),
140    Int8(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int8Type>),
141    Int16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int16Type>),
142    Int32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int32Type>),
143    Int64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int64Type>),
144    UInt8(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt8Type>),
145    UInt16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt16Type>),
146    UInt32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt32Type>),
147    UInt64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt64Type>),
148    Float16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float16Type>),
149    Float32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float32Type>),
150    Float64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float64Type>),
151    Decimal32(VariantToDecimalArrowRowBuilder<'a, datatypes::Decimal32Type>),
152    Decimal64(VariantToDecimalArrowRowBuilder<'a, datatypes::Decimal64Type>),
153    Decimal128(VariantToDecimalArrowRowBuilder<'a, datatypes::Decimal128Type>),
154    Decimal256(VariantToDecimalArrowRowBuilder<'a, datatypes::Decimal256Type>),
155    TimestampSecond(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampSecondType>),
156    TimestampSecondNtz(VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampSecondType>),
157    TimestampMilli(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampMillisecondType>),
158    TimestampMilliNtz(
159        VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampMillisecondType>,
160    ),
161    TimestampMicro(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>),
162    TimestampMicroNtz(
163        VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>,
164    ),
165    TimestampNano(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampNanosecondType>),
166    TimestampNanoNtz(VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampNanosecondType>),
167    Time32Second(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Time32SecondType>),
168    Time32Milli(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Time32MillisecondType>),
169    Time64Micro(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Time64MicrosecondType>),
170    Time64Nano(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Time64NanosecondType>),
171    Date32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>),
172    Date64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date64Type>),
173    Uuid(VariantToUuidArrowRowBuilder<'a>),
174    String(VariantToStringArrowBuilder<'a, StringBuilder>),
175    LargeString(VariantToStringArrowBuilder<'a, LargeStringBuilder>),
176    StringView(VariantToStringArrowBuilder<'a, StringViewBuilder>),
177    Binary(VariantToBinaryArrowRowBuilder<'a, BinaryBuilder>),
178    LargeBinary(VariantToBinaryArrowRowBuilder<'a, LargeBinaryBuilder>),
179    BinaryView(VariantToBinaryArrowRowBuilder<'a, BinaryViewBuilder>),
180}
181
182impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
183    pub fn append_null(&mut self) -> Result<()> {
184        use PrimitiveVariantToArrowRowBuilder::*;
185        match self {
186            Null(b) => b.append_null(),
187            Boolean(b) => b.append_null(),
188            Int8(b) => b.append_null(),
189            Int16(b) => b.append_null(),
190            Int32(b) => b.append_null(),
191            Int64(b) => b.append_null(),
192            UInt8(b) => b.append_null(),
193            UInt16(b) => b.append_null(),
194            UInt32(b) => b.append_null(),
195            UInt64(b) => b.append_null(),
196            Float16(b) => b.append_null(),
197            Float32(b) => b.append_null(),
198            Float64(b) => b.append_null(),
199            Decimal32(b) => b.append_null(),
200            Decimal64(b) => b.append_null(),
201            Decimal128(b) => b.append_null(),
202            Decimal256(b) => b.append_null(),
203            TimestampSecond(b) => b.append_null(),
204            TimestampSecondNtz(b) => b.append_null(),
205            TimestampMilli(b) => b.append_null(),
206            TimestampMilliNtz(b) => b.append_null(),
207            TimestampMicro(b) => b.append_null(),
208            TimestampMicroNtz(b) => b.append_null(),
209            TimestampNano(b) => b.append_null(),
210            TimestampNanoNtz(b) => b.append_null(),
211            Time32Second(b) => b.append_null(),
212            Time32Milli(b) => b.append_null(),
213            Time64Micro(b) => b.append_null(),
214            Time64Nano(b) => b.append_null(),
215            Date32(b) => b.append_null(),
216            Date64(b) => b.append_null(),
217            Uuid(b) => b.append_null(),
218            String(b) => b.append_null(),
219            LargeString(b) => b.append_null(),
220            StringView(b) => b.append_null(),
221            Binary(b) => b.append_null(),
222            LargeBinary(b) => b.append_null(),
223            BinaryView(b) => b.append_null(),
224        }
225    }
226
227    pub fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
228        use PrimitiveVariantToArrowRowBuilder::*;
229        match self {
230            Null(b) => b.append_value(value),
231            Boolean(b) => b.append_value(value),
232            Int8(b) => b.append_value(value),
233            Int16(b) => b.append_value(value),
234            Int32(b) => b.append_value(value),
235            Int64(b) => b.append_value(value),
236            UInt8(b) => b.append_value(value),
237            UInt16(b) => b.append_value(value),
238            UInt32(b) => b.append_value(value),
239            UInt64(b) => b.append_value(value),
240            Float16(b) => b.append_value(value),
241            Float32(b) => b.append_value(value),
242            Float64(b) => b.append_value(value),
243            Decimal32(b) => b.append_value(value),
244            Decimal64(b) => b.append_value(value),
245            Decimal128(b) => b.append_value(value),
246            Decimal256(b) => b.append_value(value),
247            TimestampSecond(b) => b.append_value(value),
248            TimestampSecondNtz(b) => b.append_value(value),
249            TimestampMilli(b) => b.append_value(value),
250            TimestampMilliNtz(b) => b.append_value(value),
251            TimestampMicro(b) => b.append_value(value),
252            TimestampMicroNtz(b) => b.append_value(value),
253            TimestampNano(b) => b.append_value(value),
254            TimestampNanoNtz(b) => b.append_value(value),
255            Time32Second(b) => b.append_value(value),
256            Time32Milli(b) => b.append_value(value),
257            Time64Micro(b) => b.append_value(value),
258            Time64Nano(b) => b.append_value(value),
259            Date32(b) => b.append_value(value),
260            Date64(b) => b.append_value(value),
261            Uuid(b) => b.append_value(value),
262            String(b) => b.append_value(value),
263            LargeString(b) => b.append_value(value),
264            StringView(b) => b.append_value(value),
265            Binary(b) => b.append_value(value),
266            LargeBinary(b) => b.append_value(value),
267            BinaryView(b) => b.append_value(value),
268        }
269    }
270
271    pub fn finish(self) -> Result<ArrayRef> {
272        use PrimitiveVariantToArrowRowBuilder::*;
273        match self {
274            Null(b) => b.finish(),
275            Boolean(b) => b.finish(),
276            Int8(b) => b.finish(),
277            Int16(b) => b.finish(),
278            Int32(b) => b.finish(),
279            Int64(b) => b.finish(),
280            UInt8(b) => b.finish(),
281            UInt16(b) => b.finish(),
282            UInt32(b) => b.finish(),
283            UInt64(b) => b.finish(),
284            Float16(b) => b.finish(),
285            Float32(b) => b.finish(),
286            Float64(b) => b.finish(),
287            Decimal32(b) => b.finish(),
288            Decimal64(b) => b.finish(),
289            Decimal128(b) => b.finish(),
290            Decimal256(b) => b.finish(),
291            TimestampSecond(b) => b.finish(),
292            TimestampSecondNtz(b) => b.finish(),
293            TimestampMilli(b) => b.finish(),
294            TimestampMilliNtz(b) => b.finish(),
295            TimestampMicro(b) => b.finish(),
296            TimestampMicroNtz(b) => b.finish(),
297            TimestampNano(b) => b.finish(),
298            TimestampNanoNtz(b) => b.finish(),
299            Time32Second(b) => b.finish(),
300            Time32Milli(b) => b.finish(),
301            Time64Micro(b) => b.finish(),
302            Time64Nano(b) => b.finish(),
303            Date32(b) => b.finish(),
304            Date64(b) => b.finish(),
305            Uuid(b) => b.finish(),
306            String(b) => b.finish(),
307            LargeString(b) => b.finish(),
308            StringView(b) => b.finish(),
309            Binary(b) => b.finish(),
310            LargeBinary(b) => b.finish(),
311            BinaryView(b) => b.finish(),
312        }
313    }
314}
315
316/// Creates a row builder that converts primitive `Variant` values into the requested Arrow data type.
317pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
318    data_type: &'a DataType,
319    cast_options: &'a CastOptions,
320    capacity: usize,
321) -> Result<PrimitiveVariantToArrowRowBuilder<'a>> {
322    use PrimitiveVariantToArrowRowBuilder::*;
323
324    let builder =
325        match data_type {
326            DataType::Null => Null(VariantToNullArrowRowBuilder::new(cast_options, capacity)),
327            DataType::Boolean => {
328                Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity))
329            }
330            DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
331                cast_options,
332                capacity,
333            )),
334            DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
335                cast_options,
336                capacity,
337            )),
338            DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
339                cast_options,
340                capacity,
341            )),
342            DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
343                cast_options,
344                capacity,
345            )),
346            DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
347                cast_options,
348                capacity,
349            )),
350            DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
351                cast_options,
352                capacity,
353            )),
354            DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
355                cast_options,
356                capacity,
357            )),
358            DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
359                cast_options,
360                capacity,
361            )),
362            DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new(
363                cast_options,
364                capacity,
365            )),
366            DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new(
367                cast_options,
368                capacity,
369            )),
370            DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new(
371                cast_options,
372                capacity,
373            )),
374            DataType::Decimal32(precision, scale) => Decimal32(
375                VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?,
376            ),
377            DataType::Decimal64(precision, scale) => Decimal64(
378                VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?,
379            ),
380            DataType::Decimal128(precision, scale) => Decimal128(
381                VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?,
382            ),
383            DataType::Decimal256(precision, scale) => Decimal256(
384                VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?,
385            ),
386            DataType::Date32 => Date32(VariantToPrimitiveArrowRowBuilder::new(
387                cast_options,
388                capacity,
389            )),
390            DataType::Date64 => Date64(VariantToPrimitiveArrowRowBuilder::new(
391                cast_options,
392                capacity,
393            )),
394            DataType::Time32(TimeUnit::Second) => Time32Second(
395                VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity),
396            ),
397            DataType::Time32(TimeUnit::Millisecond) => Time32Milli(
398                VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity),
399            ),
400            DataType::Time32(t) => {
401                return Err(ArrowError::InvalidArgumentError(format!(
402                    "The unit for Time32 must be second/millisecond, received {t:?}"
403                )));
404            }
405            DataType::Time64(TimeUnit::Microsecond) => Time64Micro(
406                VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity),
407            ),
408            DataType::Time64(TimeUnit::Nanosecond) => Time64Nano(
409                VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity),
410            ),
411            DataType::Time64(t) => {
412                return Err(ArrowError::InvalidArgumentError(format!(
413                    "The unit for Time64 must be micro/nano seconds, received {t:?}"
414                )));
415            }
416            DataType::Timestamp(TimeUnit::Second, None) => TimestampSecondNtz(
417                VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
418            ),
419            DataType::Timestamp(TimeUnit::Second, tz) => TimestampSecond(
420                VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()),
421            ),
422            DataType::Timestamp(TimeUnit::Millisecond, None) => TimestampMilliNtz(
423                VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
424            ),
425            DataType::Timestamp(TimeUnit::Millisecond, tz) => TimestampMilli(
426                VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()),
427            ),
428            DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicroNtz(
429                VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
430            ),
431            DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
432                VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()),
433            ),
434            DataType::Timestamp(TimeUnit::Nanosecond, None) => TimestampNanoNtz(
435                VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
436            ),
437            DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
438                VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()),
439            ),
440            DataType::Duration(_) | DataType::Interval(_) => {
441                return Err(ArrowError::InvalidArgumentError(
442                    "Casting Variant to duration/interval types is not supported. \
443                    The Variant format does not define duration/interval types."
444                        .to_string(),
445                ));
446            }
447            DataType::Binary => Binary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity)),
448            DataType::LargeBinary => {
449                LargeBinary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity))
450            }
451            DataType::BinaryView => {
452                BinaryView(VariantToBinaryArrowRowBuilder::new(cast_options, capacity))
453            }
454            DataType::FixedSizeBinary(16) => {
455                Uuid(VariantToUuidArrowRowBuilder::new(cast_options, capacity))
456            }
457            DataType::FixedSizeBinary(_) => {
458                return Err(ArrowError::NotYetImplemented(format!(
459                    "DataType {data_type:?} not yet implemented"
460                )));
461            }
462            DataType::Utf8 => String(VariantToStringArrowBuilder::new(cast_options, capacity)),
463            DataType::LargeUtf8 => {
464                LargeString(VariantToStringArrowBuilder::new(cast_options, capacity))
465            }
466            DataType::Utf8View => {
467                StringView(VariantToStringArrowBuilder::new(cast_options, capacity))
468            }
469            DataType::List(_)
470            | DataType::LargeList(_)
471            | DataType::ListView(_)
472            | DataType::LargeListView(_)
473            | DataType::FixedSizeList(..)
474            | DataType::Struct(_)
475            | DataType::Map(..)
476            | DataType::Union(..)
477            | DataType::Dictionary(..)
478            | DataType::RunEndEncoded(..) => {
479                return Err(ArrowError::InvalidArgumentError(format!(
480                    "Casting to {data_type:?} is not applicable for primitive Variant types"
481                )));
482            }
483        };
484    Ok(builder)
485}
486
487pub(crate) enum ArrayVariantToArrowRowBuilder<'a> {
488    List(VariantToListArrowRowBuilder<'a, i32, false>),
489    LargeList(VariantToListArrowRowBuilder<'a, i64, false>),
490    ListView(VariantToListArrowRowBuilder<'a, i32, true>),
491    LargeListView(VariantToListArrowRowBuilder<'a, i64, true>),
492}
493
494impl<'a> ArrayVariantToArrowRowBuilder<'a> {
495    pub(crate) fn try_new(
496        data_type: &'a DataType,
497        cast_options: &'a CastOptions,
498        capacity: usize,
499    ) -> Result<Self> {
500        use ArrayVariantToArrowRowBuilder::*;
501
502        // Make List/ListView builders without repeating the constructor boilerplate.
503        macro_rules! make_list_builder {
504            ($variant:ident, $offset:ty, $is_view:expr, $field:ident) => {
505                $variant(VariantToListArrowRowBuilder::<$offset, $is_view>::try_new(
506                    $field.clone(),
507                    $field.data_type(),
508                    cast_options,
509                    capacity,
510                )?)
511            };
512        }
513
514        let builder = match data_type {
515            DataType::List(field) => make_list_builder!(List, i32, false, field),
516            DataType::LargeList(field) => make_list_builder!(LargeList, i64, false, field),
517            DataType::ListView(field) => make_list_builder!(ListView, i32, true, field),
518            DataType::LargeListView(field) => make_list_builder!(LargeListView, i64, true, field),
519            DataType::FixedSizeList(..) => {
520                return Err(ArrowError::NotYetImplemented(
521                    "Converting unshredded variant arrays to arrow fixed-size lists".to_string(),
522                ));
523            }
524            other => {
525                return Err(ArrowError::InvalidArgumentError(format!(
526                    "Casting to {other:?} is not applicable for array Variant types"
527                )));
528            }
529        };
530        Ok(builder)
531    }
532
533    pub(crate) fn append_null(&mut self) -> Result<()> {
534        match self {
535            Self::List(builder) => builder.append_null(),
536            Self::LargeList(builder) => builder.append_null(),
537            Self::ListView(builder) => builder.append_null(),
538            Self::LargeListView(builder) => builder.append_null(),
539        }
540    }
541
542    pub(crate) fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
543        match self {
544            Self::List(builder) => builder.append_value(value),
545            Self::LargeList(builder) => builder.append_value(value),
546            Self::ListView(builder) => builder.append_value(value),
547            Self::LargeListView(builder) => builder.append_value(value),
548        }
549    }
550
551    pub(crate) fn finish(self) -> Result<ArrayRef> {
552        match self {
553            Self::List(builder) => builder.finish(),
554            Self::LargeList(builder) => builder.finish(),
555            Self::ListView(builder) => builder.finish(),
556            Self::LargeListView(builder) => builder.finish(),
557        }
558    }
559}
560
561/// A thin wrapper whose only job is to extract a specific path from a variant value and pass the
562/// result to a nested builder.
563pub(crate) struct VariantPathRowBuilder<'a> {
564    builder: Box<VariantToArrowRowBuilder<'a>>,
565    path: VariantPath<'a>,
566}
567
568impl<'a> VariantPathRowBuilder<'a> {
569    fn append_null(&mut self) -> Result<()> {
570        self.builder.append_null()
571    }
572
573    fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
574        if let Some(v) = value.get_path(&self.path) {
575            self.builder.append_value(v)
576        } else {
577            self.builder.append_null()?;
578            Ok(false)
579        }
580    }
581
582    fn finish(self) -> Result<ArrayRef> {
583        self.builder.finish()
584    }
585}
586
// Generates a `pub(crate)` row-builder struct plus its `new` / `append_null` / `append_value` /
// `finish` methods.
//
// Inputs, in order:
//   * the struct name, its lifetime and an optional single bounded generic parameter;
//   * the name of the capacity parameter, an optional extra constructor parameter, the type of
//     the wrapped Arrow builder and the expression that constructs it;
//   * the name of the value parameter and an expression yielding `Option<_>` from it;
//   * an expression used as the type name in cast error messages.
macro_rules! define_variant_to_primitive_builder {
    (struct $name:ident<$lifetime:lifetime $(, $generic:ident: $bound:path )?>
    |$array_param:ident $(, $field:ident: $field_type:ty)?| -> $builder_name:ident $(< $array_type:ty >)? { $init_expr: expr },
    |$value: ident| $value_transform:expr,
    type_name: $type_name:expr) => {
        pub(crate) struct $name<$lifetime $(, $generic : $bound )?>
        {
            builder: $builder_name $(<$array_type>)?,
            cast_options: &$lifetime CastOptions<$lifetime>,
        }

        impl<$lifetime $(, $generic: $bound+ )?> $name<$lifetime $(, $generic )?> {
            fn new(
                cast_options: &$lifetime CastOptions<$lifetime>,
                $array_param: usize,
                // Declared as a parameter so that $init_expr can refer to it
                $( $field: $field_type, )?
            ) -> Self {
                let builder = $init_expr;
                Self {
                    builder,
                    cast_options,
                }
            }

            fn append_null(&mut self) -> Result<()> {
                self.builder.append_null();
                Ok(())
            }

            // Returns `Ok(true)` when a value was extracted and appended, `Ok(false)` when a
            // null was appended instead (safe casting), and an error otherwise.
            fn append_value(&mut self, $value: &Variant<'_, '_>) -> Result<bool> {
                match $value_transform {
                    Some(v) => {
                        self.builder.append_value(v);
                        Ok(true)
                    }
                    // Safe casting: append null on conversion failure
                    None if self.cast_options.safe => {
                        self.builder.append_null();
                        Ok(false)
                    }
                    // Unsafe casting: return error on conversion failure
                    None => Err(ArrowError::CastError(format!(
                        "Failed to extract primitive of type {} from variant {:?} at path VariantPath([])",
                        $type_name,
                        $value
                    ))),
                }
            }

            // Not every generated builder needs `mut self` here (mainly `FakeNullBuilder`),
            // so silence the unused-mut warning for the macro-generated code.
            #[allow(unused_mut)]
            fn finish(mut self) -> Result<ArrayRef> {
                // When the wrapped builder yields a concrete `T: Array`, this resolves to
                // `<Arc<T> as From<T>>::from`, which then coerces to ArrayRef. When it already
                // yields an ArrayRef, it resolves to the no-op `<ArrayRef as From<ArrayRef>>::from`.
                let finished = self.builder.finish();
                Ok(Arc::from(finished))
            }
        }
    }
}
647
648define_variant_to_primitive_builder!(
649    struct VariantToStringArrowBuilder<'a, B: StringLikeArrayBuilder>
650    |capacity| -> B { B::with_capacity(capacity) },
651    |value| value.as_string(),
652    type_name: B::type_name()
653);
654
655define_variant_to_primitive_builder!(
656    struct VariantToBooleanArrowRowBuilder<'a>
657    |capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) },
658    |value|  value.as_boolean(),
659    type_name: datatypes::BooleanType::DATA_TYPE
660);
661
662define_variant_to_primitive_builder!(
663    struct VariantToPrimitiveArrowRowBuilder<'a, T:PrimitiveFromVariant>
664    |capacity| -> PrimitiveBuilder<T> { PrimitiveBuilder::<T>::with_capacity(capacity) },
665    |value| T::from_variant(value),
666    type_name: T::DATA_TYPE
667);
668
669define_variant_to_primitive_builder!(
670    struct VariantToTimestampNtzArrowRowBuilder<'a, T:TimestampFromVariant<true>>
671    |capacity| -> PrimitiveBuilder<T> { PrimitiveBuilder::<T>::with_capacity(capacity) },
672    |value| T::from_variant(value),
673    type_name: T::DATA_TYPE
674);
675
676define_variant_to_primitive_builder!(
677    struct VariantToTimestampArrowRowBuilder<'a, T:TimestampFromVariant<false>>
678    |capacity, tz: Option<Arc<str>> | -> PrimitiveBuilder<T> {
679        PrimitiveBuilder::<T>::with_capacity(capacity).with_timezone_opt(tz)
680    },
681    |value| T::from_variant(value),
682    type_name: T::DATA_TYPE
683);
684
685define_variant_to_primitive_builder!(
686    struct VariantToBinaryArrowRowBuilder<'a, B: BinaryLikeArrayBuilder>
687    |capacity| -> B { B::with_capacity(capacity) },
688    |value| value.as_u8_slice(),
689    type_name: B::type_name()
690);
691
692/// Builder for converting variant values to arrow Decimal values
693pub(crate) struct VariantToDecimalArrowRowBuilder<'a, T>
694where
695    T: DecimalType,
696    T::Native: DecimalCast,
697{
698    builder: PrimitiveBuilder<T>,
699    cast_options: &'a CastOptions<'a>,
700    precision: u8,
701    scale: i8,
702}
703
704impl<'a, T> VariantToDecimalArrowRowBuilder<'a, T>
705where
706    T: DecimalType,
707    T::Native: DecimalCast,
708{
709    fn new(
710        cast_options: &'a CastOptions<'a>,
711        capacity: usize,
712        precision: u8,
713        scale: i8,
714    ) -> Result<Self> {
715        let builder = PrimitiveBuilder::<T>::with_capacity(capacity)
716            .with_precision_and_scale(precision, scale)?;
717        Ok(Self {
718            builder,
719            cast_options,
720            precision,
721            scale,
722        })
723    }
724
725    fn append_null(&mut self) -> Result<()> {
726        self.builder.append_null();
727        Ok(())
728    }
729
730    fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
731        if let Some(scaled) = variant_to_unscaled_decimal::<T>(value, self.precision, self.scale) {
732            self.builder.append_value(scaled);
733            Ok(true)
734        } else if self.cast_options.safe {
735            self.builder.append_null();
736            Ok(false)
737        } else {
738            Err(ArrowError::CastError(format!(
739                "Failed to cast to {}(precision={}, scale={}) from variant {:?}",
740                T::PREFIX,
741                self.precision,
742                self.scale,
743                value
744            )))
745        }
746    }
747
748    fn finish(mut self) -> Result<ArrayRef> {
749        Ok(Arc::new(self.builder.finish()))
750    }
751}
752
753/// Builder for converting variant values to FixedSizeBinary(16) for UUIDs
754pub(crate) struct VariantToUuidArrowRowBuilder<'a> {
755    builder: FixedSizeBinaryBuilder,
756    cast_options: &'a CastOptions<'a>,
757}
758
759impl<'a> VariantToUuidArrowRowBuilder<'a> {
760    fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
761        Self {
762            builder: FixedSizeBinaryBuilder::with_capacity(capacity, 16),
763            cast_options,
764        }
765    }
766
767    fn append_null(&mut self) -> Result<()> {
768        self.builder.append_null();
769        Ok(())
770    }
771
772    fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
773        match value.as_uuid() {
774            Some(uuid) => {
775                self.builder
776                    .append_value(uuid.as_bytes())
777                    .map_err(|e| ArrowError::ExternalError(Box::new(e)))?;
778
779                Ok(true)
780            }
781            None if self.cast_options.safe => {
782                self.builder.append_null();
783                Ok(false)
784            }
785            None => Err(ArrowError::CastError(format!(
786                "Failed to extract UUID from variant {value:?}",
787            ))),
788        }
789    }
790
791    fn finish(mut self) -> Result<ArrayRef> {
792        Ok(Arc::new(self.builder.finish()))
793    }
794}
795
796pub(crate) struct VariantToListArrowRowBuilder<'a, O, const IS_VIEW: bool>
797where
798    O: OffsetSizeTrait + ArrowNativeTypeOp,
799{
800    field: FieldRef,
801    offsets: Vec<O>,
802    element_builder: Box<VariantToShreddedVariantRowBuilder<'a>>,
803    nulls: NullBufferBuilder,
804    current_offset: O,
805    cast_options: &'a CastOptions<'a>,
806}
807
808impl<'a, O, const IS_VIEW: bool> VariantToListArrowRowBuilder<'a, O, IS_VIEW>
809where
810    O: OffsetSizeTrait + ArrowNativeTypeOp,
811{
812    fn try_new(
813        field: FieldRef,
814        element_data_type: &'a DataType,
815        cast_options: &'a CastOptions,
816        capacity: usize,
817    ) -> Result<Self> {
818        if capacity >= isize::MAX as usize {
819            return Err(ArrowError::ComputeError(
820                "Capacity exceeds isize::MAX when reserving list offsets".to_string(),
821            ));
822        }
823        let mut offsets = Vec::with_capacity(capacity + 1);
824        offsets.push(O::ZERO);
825        let element_builder = make_variant_to_shredded_variant_arrow_row_builder(
826            element_data_type,
827            cast_options,
828            capacity,
829            false,
830        )?;
831        Ok(Self {
832            field,
833            offsets,
834            element_builder: Box::new(element_builder),
835            nulls: NullBufferBuilder::new(capacity),
836            current_offset: O::ZERO,
837            cast_options,
838        })
839    }
840
841    fn append_null(&mut self) -> Result<()> {
842        self.offsets.push(self.current_offset);
843        self.nulls.append_null();
844        Ok(())
845    }
846
847    fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
848        match value {
849            Variant::List(list) => {
850                for element in list.iter() {
851                    self.element_builder.append_value(element)?;
852                    self.current_offset = self.current_offset.add_checked(O::ONE)?;
853                }
854                self.offsets.push(self.current_offset);
855                self.nulls.append_non_null();
856                Ok(true)
857            }
858            _ if self.cast_options.safe => {
859                self.append_null()?;
860                Ok(false)
861            }
862            _ => Err(ArrowError::CastError(format!(
863                "Failed to extract list from variant {:?}",
864                value
865            ))),
866        }
867    }
868
869    fn finish(mut self) -> Result<ArrayRef> {
870        let (value, typed_value, nulls) = self.element_builder.finish()?;
871        let element_array =
872            ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls);
873        let field = Arc::new(
874            self.field
875                .as_ref()
876                .clone()
877                .with_data_type(element_array.data_type().clone()),
878        );
879
880        if IS_VIEW {
881            // NOTE: `offsets` is never empty (constructor pushes an entry)
882            let mut sizes = Vec::with_capacity(self.offsets.len() - 1);
883            for i in 1..self.offsets.len() {
884                sizes.push(self.offsets[i] - self.offsets[i - 1]);
885            }
886            self.offsets.pop();
887            let list_view_array = GenericListViewArray::<O>::new(
888                field,
889                ScalarBuffer::from(self.offsets),
890                ScalarBuffer::from(sizes),
891                ArrayRef::from(element_array),
892                self.nulls.finish(),
893            );
894            Ok(Arc::new(list_view_array))
895        } else {
896            let list_array = GenericListArray::<O>::new(
897                field,
898                OffsetBuffer::<O>::new(ScalarBuffer::from(self.offsets)),
899                ArrayRef::from(element_array),
900                self.nulls.finish(),
901            );
902            Ok(Arc::new(list_array))
903        }
904    }
905}
906
907/// Builder for creating VariantArray output (for path extraction without type conversion)
908pub(crate) struct VariantToBinaryVariantArrowRowBuilder {
909    metadata: BinaryViewArray,
910    builder: VariantValueArrayBuilder,
911    nulls: NullBufferBuilder,
912}
913
914impl VariantToBinaryVariantArrowRowBuilder {
915    fn new(metadata: BinaryViewArray, capacity: usize) -> Self {
916        Self {
917            metadata,
918            builder: VariantValueArrayBuilder::new(capacity),
919            nulls: NullBufferBuilder::new(capacity),
920        }
921    }
922}
923
924impl VariantToBinaryVariantArrowRowBuilder {
925    fn append_null(&mut self) -> Result<()> {
926        self.builder.append_null();
927        self.nulls.append_null();
928        Ok(())
929    }
930
931    fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
932        self.builder.append_value(value);
933        self.nulls.append_non_null();
934        Ok(true)
935    }
936
937    fn finish(mut self) -> Result<ArrayRef> {
938        let variant_array = VariantArray::from_parts(
939            self.metadata,
940            Some(self.builder.build()?),
941            None, // no typed_value column
942            self.nulls.finish(),
943        );
944
945        Ok(ArrayRef::from(variant_array))
946    }
947}
948
949#[derive(Default)]
950struct FakeNullBuilder {
951    item_count: usize,
952}
953
954impl FakeNullBuilder {
955    fn append_value(&mut self, _: ()) {
956        self.item_count += 1;
957    }
958
959    fn append_null(&mut self) {
960        self.item_count += 1;
961    }
962
963    fn finish(self) -> NullArray {
964        NullArray::new(self.item_count)
965    }
966}
967
968define_variant_to_primitive_builder!(
969    struct VariantToNullArrowRowBuilder<'a>
970    |_capacity| -> FakeNullBuilder { FakeNullBuilder::default() },
971    |value| value.as_null(),
972    type_name: "Null"
973);
974
#[cfg(test)]
mod tests {
    use super::make_primitive_variant_to_arrow_row_builder;
    use arrow::compute::CastOptions;
    use arrow::datatypes::{DataType, Field, Fields, UnionFields, UnionMode};
    use arrow::error::ArrowError;
    use std::sync::Arc;

    /// Each nested or encoded Arrow type must be refused with an
    /// `InvalidArgumentError` whose message contains the debug form of that type.
    #[test]
    fn make_primitive_builder_rejects_non_primitive_types() {
        let cast_options = CastOptions::default();

        // Child fields shared by the list-like types.
        let item_field = Arc::new(Field::new("item", DataType::Int32, true));

        let struct_type =
            DataType::Struct(Fields::from(vec![Field::new("child", DataType::Int32, true)]));

        let map_entries_struct = DataType::Struct(Fields::from(vec![
            Field::new("key", DataType::Utf8, false),
            Field::new("value", DataType::Float64, true),
        ]));
        let map_type = DataType::Map(
            Arc::new(Field::new("entries", map_entries_struct, true)),
            false,
        );

        let union_fields =
            UnionFields::try_new(vec![1], vec![Field::new("child", DataType::Int32, true)])
                .unwrap();

        let run_end_encoded_type = DataType::RunEndEncoded(
            Arc::new(Field::new("run_ends", DataType::Int32, false)),
            Arc::new(Field::new("values", DataType::Utf8, true)),
        );

        let non_primitive_types = vec![
            DataType::List(Arc::clone(&item_field)),
            DataType::LargeList(Arc::clone(&item_field)),
            DataType::ListView(Arc::clone(&item_field)),
            DataType::LargeListView(Arc::clone(&item_field)),
            DataType::FixedSizeList(Arc::clone(&item_field), 2),
            struct_type,
            map_type,
            DataType::Union(union_fields, UnionMode::Dense),
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
            run_end_encoded_type,
        ];

        for data_type in non_primitive_types {
            let outcome = make_primitive_variant_to_arrow_row_builder(&data_type, &cast_options, 1);
            let Err(err) = outcome else {
                panic!("non-primitive type {data_type:?} should be rejected");
            };

            match err {
                ArrowError::InvalidArgumentError(msg) => {
                    let expected_fragment = format!("{data_type:?}");
                    assert!(msg.contains(&expected_fragment));
                }
                other => panic!("expected InvalidArgumentError, got {other:?}"),
            }
        }
    }
}