parquet_variant_compute/variant_get/output/
variant.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::variant_get::output::OutputBuilder;
19use crate::{VariantArray, VariantArrayBuilder};
20use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray};
21use arrow::datatypes::Int32Type;
22use arrow_schema::{ArrowError, DataType};
23use parquet_variant::{Variant, VariantPath};
24use std::sync::Arc;
25
26/// Outputs VariantArrays
27pub(super) struct VariantOutputBuilder<'a> {
28    /// What path to extract
29    path: VariantPath<'a>,
30}
31
32impl<'a> VariantOutputBuilder<'a> {
33    pub(super) fn new(path: VariantPath<'a>) -> Self {
34        Self { path }
35    }
36}
37
38impl<'a> OutputBuilder for VariantOutputBuilder<'a> {
39    fn partially_shredded(
40        &self,
41        variant_array: &VariantArray,
42        // TODO(perf): can reuse the metadata field here to avoid re-creating it
43        _metadata: &BinaryViewArray,
44        _value_field: &BinaryViewArray,
45        typed_value: &ArrayRef,
46    ) -> arrow::error::Result<ArrayRef> {
47        // in this case dispatch on the typed_value and
48        // TODO macro'ize this using downcast! to handle all other primitive types
49        // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same)
50        let mut array_builder = VariantArrayBuilder::new(variant_array.len());
51        match typed_value.data_type() {
52            DataType::Int32 => {
53                let primitive_array = typed_value.as_primitive::<Int32Type>();
54                for i in 0..variant_array.len() {
55                    if variant_array.is_null(i) {
56                        array_builder.append_null();
57                        continue;
58                    }
59
60                    if typed_value.is_null(i) {
61                        // fall back to the value (variant) field
62                        // (TODO could copy the variant bytes directly)
63                        let value = variant_array.value(i);
64                        array_builder.append_variant(value);
65                        continue;
66                    }
67
68                    // otherwise we have a typed value, so we can use it directly
69                    let int_value = primitive_array.value(i);
70                    array_builder.append_variant(Variant::from(int_value));
71                }
72            }
73            dt => {
74                // https://github.com/apache/arrow-rs/issues/8086
75                return Err(ArrowError::NotYetImplemented(format!(
76                    "variant_get fully_shredded with typed_value={dt} is not implemented yet",
77                )));
78            }
79        };
80        Ok(Arc::new(array_builder.build()))
81    }
82
83    fn typed(
84        &self,
85        variant_array: &VariantArray,
86        // TODO(perf): can reuse the metadata field here to avoid re-creating it
87        _metadata: &BinaryViewArray,
88        typed_value: &ArrayRef,
89    ) -> arrow::error::Result<ArrayRef> {
90        // in this case dispatch on the typed_value and
91        // TODO macro'ize this using downcast! to handle all other primitive types
92        // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same)
93        let mut array_builder = VariantArrayBuilder::new(variant_array.len());
94        match typed_value.data_type() {
95            DataType::Int32 => {
96                let primitive_array = typed_value.as_primitive::<Int32Type>();
97                for i in 0..variant_array.len() {
98                    if primitive_array.is_null(i) {
99                        array_builder.append_null();
100                        continue;
101                    }
102
103                    let int_value = primitive_array.value(i);
104                    array_builder.append_variant(Variant::from(int_value));
105                }
106            }
107            dt => {
108                // https://github.com/apache/arrow-rs/issues/8087
109                return Err(ArrowError::NotYetImplemented(format!(
110                    "variant_get fully_shredded with typed_value={dt} is not implemented yet",
111                )));
112            }
113        };
114        Ok(Arc::new(array_builder.build()))
115    }
116
117    fn unshredded(
118        &self,
119        variant_array: &VariantArray,
120        _metadata: &BinaryViewArray,
121        _value_field: &BinaryViewArray,
122    ) -> arrow::error::Result<ArrayRef> {
123        let mut builder = VariantArrayBuilder::new(variant_array.len());
124        for i in 0..variant_array.len() {
125            let new_variant = variant_array.value(i);
126
127            // TODO: perf?
128            let Some(new_variant) = new_variant.get_path(&self.path) else {
129                // path not found, append null
130                builder.append_null();
131                continue;
132            };
133
134            // TODO: we're decoding the value and doing a copy into a variant value
135            // again. This can be much faster by using the _metadata and _value_field
136            // to avoid decoding the entire variant:
137            //
138            // 1) reuse the metadata arrays as is
139            //
140            // 2) Create a new BinaryViewArray that uses the same underlying buffers
141            // that the original variant used, but whose views points to a new
142            // offset for the new path
143            builder.append_variant(new_variant);
144        }
145
146        Ok(Arc::new(builder.build()))
147    }
148}