parquet_variant_compute/
variant_to_arrow.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder};
19use arrow::compute::CastOptions;
20use arrow::datatypes::{self, ArrowPrimitiveType, DataType};
21use arrow::error::{ArrowError, Result};
22use parquet_variant::{Variant, VariantPath};
23
24use crate::type_conversion::PrimitiveFromVariant;
25use crate::{VariantArray, VariantValueArrayBuilder};
26
27use std::sync::Arc;
28
/// Builder for converting variant values to primitive Arrow arrays. It is used by both
/// `VariantToArrowRowBuilder` (below) and `VariantToShreddedPrimitiveVariantRowBuilder` (in
/// `shred_variant.rs`).
///
/// There is one variant per supported Arrow primitive type. Each variant wraps a
/// `VariantToPrimitiveArrowRowBuilder` specialized for the corresponding Arrow type, so the
/// target type is chosen once at construction and every later call is a simple enum dispatch.
pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
    // Signed integers.
    Int8(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int8Type>),
    Int16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int16Type>),
    Int32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int32Type>),
    Int64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int64Type>),
    // Unsigned integers.
    UInt8(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt8Type>),
    UInt16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt16Type>),
    UInt32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt32Type>),
    UInt64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt64Type>),
    // Floating point.
    Float16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float16Type>),
    Float32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float32Type>),
    Float64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float64Type>),
}
45
/// Builder for converting variant values into strongly typed Arrow arrays.
///
/// Useful for variant_get kernels that need to extract specific paths from variant values, possibly
/// with casting of leaf values to specific types.
pub(crate) enum VariantToArrowRowBuilder<'a> {
    /// Converts each value to a primitive Arrow type.
    Primitive(PrimitiveVariantToArrowRowBuilder<'a>),
    /// Keeps each value as a variant and produces a `VariantArray`.
    BinaryVariant(VariantToBinaryVariantArrowRowBuilder),

    // Path extraction wrapper - contains a boxed enum for any of the above
    WithPath(VariantPathRowBuilder<'a>),
}
57
58impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
59    pub fn append_null(&mut self) -> Result<()> {
60        use PrimitiveVariantToArrowRowBuilder::*;
61        match self {
62            Int8(b) => b.append_null(),
63            Int16(b) => b.append_null(),
64            Int32(b) => b.append_null(),
65            Int64(b) => b.append_null(),
66            UInt8(b) => b.append_null(),
67            UInt16(b) => b.append_null(),
68            UInt32(b) => b.append_null(),
69            UInt64(b) => b.append_null(),
70            Float16(b) => b.append_null(),
71            Float32(b) => b.append_null(),
72            Float64(b) => b.append_null(),
73        }
74    }
75
76    pub fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
77        use PrimitiveVariantToArrowRowBuilder::*;
78        match self {
79            Int8(b) => b.append_value(value),
80            Int16(b) => b.append_value(value),
81            Int32(b) => b.append_value(value),
82            Int64(b) => b.append_value(value),
83            UInt8(b) => b.append_value(value),
84            UInt16(b) => b.append_value(value),
85            UInt32(b) => b.append_value(value),
86            UInt64(b) => b.append_value(value),
87            Float16(b) => b.append_value(value),
88            Float32(b) => b.append_value(value),
89            Float64(b) => b.append_value(value),
90        }
91    }
92
93    pub fn finish(self) -> Result<ArrayRef> {
94        use PrimitiveVariantToArrowRowBuilder::*;
95        match self {
96            Int8(b) => b.finish(),
97            Int16(b) => b.finish(),
98            Int32(b) => b.finish(),
99            Int64(b) => b.finish(),
100            UInt8(b) => b.finish(),
101            UInt16(b) => b.finish(),
102            UInt32(b) => b.finish(),
103            UInt64(b) => b.finish(),
104            Float16(b) => b.finish(),
105            Float32(b) => b.finish(),
106            Float64(b) => b.finish(),
107        }
108    }
109}
110
111impl<'a> VariantToArrowRowBuilder<'a> {
112    pub fn append_null(&mut self) -> Result<()> {
113        use VariantToArrowRowBuilder::*;
114        match self {
115            Primitive(b) => b.append_null(),
116            BinaryVariant(b) => b.append_null(),
117            WithPath(path_builder) => path_builder.append_null(),
118        }
119    }
120
121    pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
122        use VariantToArrowRowBuilder::*;
123        match self {
124            Primitive(b) => b.append_value(&value),
125            BinaryVariant(b) => b.append_value(value),
126            WithPath(path_builder) => path_builder.append_value(value),
127        }
128    }
129
130    pub fn finish(self) -> Result<ArrayRef> {
131        use VariantToArrowRowBuilder::*;
132        match self {
133            Primitive(b) => b.finish(),
134            BinaryVariant(b) => b.finish(),
135            WithPath(path_builder) => path_builder.finish(),
136        }
137    }
138}
139
140/// Creates a primitive row builder, returning Err if the requested data type is not primitive.
141pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
142    data_type: &'a DataType,
143    cast_options: &'a CastOptions,
144    capacity: usize,
145) -> Result<PrimitiveVariantToArrowRowBuilder<'a>> {
146    use PrimitiveVariantToArrowRowBuilder::*;
147
148    let builder = match data_type {
149        DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
150            cast_options,
151            capacity,
152        )),
153        DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
154            cast_options,
155            capacity,
156        )),
157        DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
158            cast_options,
159            capacity,
160        )),
161        DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
162            cast_options,
163            capacity,
164        )),
165        DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
166            cast_options,
167            capacity,
168        )),
169        DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
170            cast_options,
171            capacity,
172        )),
173        DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
174            cast_options,
175            capacity,
176        )),
177        DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
178            cast_options,
179            capacity,
180        )),
181        DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new(
182            cast_options,
183            capacity,
184        )),
185        DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new(
186            cast_options,
187            capacity,
188        )),
189        DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new(
190            cast_options,
191            capacity,
192        )),
193        _ if data_type.is_primitive() => {
194            return Err(ArrowError::NotYetImplemented(format!(
195                "Primitive data_type {data_type:?} not yet implemented"
196            )));
197        }
198        _ => {
199            return Err(ArrowError::InvalidArgumentError(format!(
200                "Not a primitive type: {data_type:?}"
201            )));
202        }
203    };
204    Ok(builder)
205}
206
207pub(crate) fn make_variant_to_arrow_row_builder<'a>(
208    metadata: &BinaryViewArray,
209    path: VariantPath<'a>,
210    data_type: Option<&'a DataType>,
211    cast_options: &'a CastOptions,
212    capacity: usize,
213) -> Result<VariantToArrowRowBuilder<'a>> {
214    use VariantToArrowRowBuilder::*;
215
216    let mut builder = match data_type {
217        // If no data type was requested, build an unshredded VariantArray.
218        None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new(
219            metadata.clone(),
220            capacity,
221        )),
222        Some(DataType::Struct(_)) => {
223            return Err(ArrowError::NotYetImplemented(
224                "Converting unshredded variant objects to arrow structs".to_string(),
225            ));
226        }
227        Some(
228            DataType::List(_)
229            | DataType::LargeList(_)
230            | DataType::ListView(_)
231            | DataType::LargeListView(_)
232            | DataType::FixedSizeList(..),
233        ) => {
234            return Err(ArrowError::NotYetImplemented(
235                "Converting unshredded variant arrays to arrow lists".to_string(),
236            ));
237        }
238        Some(data_type) => {
239            let builder =
240                make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?;
241            Primitive(builder)
242        }
243    };
244
245    // Wrap with path extraction if needed
246    if !path.is_empty() {
247        builder = WithPath(VariantPathRowBuilder {
248            builder: Box::new(builder),
249            path,
250        })
251    };
252
253    Ok(builder)
254}
255
256/// A thin wrapper whose only job is to extract a specific path from a variant value and pass the
257/// result to a nested builder.
258pub(crate) struct VariantPathRowBuilder<'a> {
259    builder: Box<VariantToArrowRowBuilder<'a>>,
260    path: VariantPath<'a>,
261}
262
263impl<'a> VariantPathRowBuilder<'a> {
264    fn append_null(&mut self) -> Result<()> {
265        self.builder.append_null()
266    }
267
268    fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
269        if let Some(v) = value.get_path(&self.path) {
270            self.builder.append_value(v)
271        } else {
272            self.builder.append_null()?;
273            Ok(false)
274        }
275    }
276
277    fn finish(self) -> Result<ArrayRef> {
278        self.builder.finish()
279    }
280}
281
282/// Helper function to get a user-friendly type name
283fn get_type_name<T: ArrowPrimitiveType>() -> &'static str {
284    match std::any::type_name::<T>() {
285        "arrow_array::types::Int32Type" => "Int32",
286        "arrow_array::types::Int16Type" => "Int16",
287        "arrow_array::types::Int8Type" => "Int8",
288        "arrow_array::types::Int64Type" => "Int64",
289        "arrow_array::types::UInt32Type" => "UInt32",
290        "arrow_array::types::UInt16Type" => "UInt16",
291        "arrow_array::types::UInt8Type" => "UInt8",
292        "arrow_array::types::UInt64Type" => "UInt64",
293        "arrow_array::types::Float32Type" => "Float32",
294        "arrow_array::types::Float64Type" => "Float64",
295        "arrow_array::types::Float16Type" => "Float16",
296        _ => "Unknown",
297    }
298}
299
300/// Builder for converting variant values to primitive values
301pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T: PrimitiveFromVariant> {
302    builder: arrow::array::PrimitiveBuilder<T>,
303    cast_options: &'a CastOptions<'a>,
304}
305
306impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> {
307    fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
308        Self {
309            builder: PrimitiveBuilder::<T>::with_capacity(capacity),
310            cast_options,
311        }
312    }
313}
314
315impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> {
316    fn append_null(&mut self) -> Result<()> {
317        self.builder.append_null();
318        Ok(())
319    }
320
321    fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
322        if let Some(v) = T::from_variant(value) {
323            self.builder.append_value(v);
324            Ok(true)
325        } else {
326            if !self.cast_options.safe {
327                // Unsafe casting: return error on conversion failure
328                return Err(ArrowError::CastError(format!(
329                    "Failed to extract primitive of type {} from variant {:?} at path VariantPath([])",
330                    get_type_name::<T>(),
331                    value
332                )));
333            }
334            // Safe casting: append null on conversion failure
335            self.builder.append_null();
336            Ok(false)
337        }
338    }
339
340    fn finish(mut self) -> Result<ArrayRef> {
341        Ok(Arc::new(self.builder.finish()))
342    }
343}
344
345/// Builder for creating VariantArray output (for path extraction without type conversion)
346pub(crate) struct VariantToBinaryVariantArrowRowBuilder {
347    metadata: BinaryViewArray,
348    builder: VariantValueArrayBuilder,
349    nulls: NullBufferBuilder,
350}
351
352impl VariantToBinaryVariantArrowRowBuilder {
353    fn new(metadata: BinaryViewArray, capacity: usize) -> Self {
354        Self {
355            metadata,
356            builder: VariantValueArrayBuilder::new(capacity),
357            nulls: NullBufferBuilder::new(capacity),
358        }
359    }
360}
361
362impl VariantToBinaryVariantArrowRowBuilder {
363    fn append_null(&mut self) -> Result<()> {
364        self.builder.append_null();
365        self.nulls.append_null();
366        Ok(())
367    }
368
369    fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
370        self.builder.append_value(value);
371        self.nulls.append_non_null();
372        Ok(true)
373    }
374
375    fn finish(mut self) -> Result<ArrayRef> {
376        let variant_array = VariantArray::from_parts(
377            self.metadata,
378            Some(self.builder.build()?),
379            None, // no typed_value column
380            self.nulls.finish(),
381        );
382
383        Ok(ArrayRef::from(variant_array))
384    }
385}