parquet_variant_compute/variant_get/output/variant.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::variant_get::output::OutputBuilder;
19use crate::{VariantArray, VariantArrayBuilder};
20use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray};
21use arrow::datatypes::Int32Type;
22use arrow_schema::{ArrowError, DataType};
23use parquet_variant::{Variant, VariantPath};
24use std::sync::Arc;
25
26/// Outputs VariantArrays
27pub(super) struct VariantOutputBuilder<'a> {
28 /// What path to extract
29 path: VariantPath<'a>,
30}
31
32impl<'a> VariantOutputBuilder<'a> {
33 pub(super) fn new(path: VariantPath<'a>) -> Self {
34 Self { path }
35 }
36}
37
38impl<'a> OutputBuilder for VariantOutputBuilder<'a> {
39 fn partially_shredded(
40 &self,
41 variant_array: &VariantArray,
42 // TODO(perf): can reuse the metadata field here to avoid re-creating it
43 _metadata: &BinaryViewArray,
44 _value_field: &BinaryViewArray,
45 typed_value: &ArrayRef,
46 ) -> arrow::error::Result<ArrayRef> {
47 // in this case dispatch on the typed_value and
48 // TODO macro'ize this using downcast! to handle all other primitive types
49 // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same)
50 let mut array_builder = VariantArrayBuilder::new(variant_array.len());
51 match typed_value.data_type() {
52 DataType::Int32 => {
53 let primitive_array = typed_value.as_primitive::<Int32Type>();
54 for i in 0..variant_array.len() {
55 if variant_array.is_null(i) {
56 array_builder.append_null();
57 continue;
58 }
59
60 if typed_value.is_null(i) {
61 // fall back to the value (variant) field
62 // (TODO could copy the variant bytes directly)
63 let value = variant_array.value(i);
64 array_builder.append_variant(value);
65 continue;
66 }
67
68 // otherwise we have a typed value, so we can use it directly
69 let int_value = primitive_array.value(i);
70 array_builder.append_variant(Variant::from(int_value));
71 }
72 }
73 dt => {
74 // https://github.com/apache/arrow-rs/issues/8086
75 return Err(ArrowError::NotYetImplemented(format!(
76 "variant_get fully_shredded with typed_value={dt} is not implemented yet",
77 )));
78 }
79 };
80 Ok(Arc::new(array_builder.build()))
81 }
82
83 fn typed(
84 &self,
85 variant_array: &VariantArray,
86 // TODO(perf): can reuse the metadata field here to avoid re-creating it
87 _metadata: &BinaryViewArray,
88 typed_value: &ArrayRef,
89 ) -> arrow::error::Result<ArrayRef> {
90 // in this case dispatch on the typed_value and
91 // TODO macro'ize this using downcast! to handle all other primitive types
92 // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same)
93 let mut array_builder = VariantArrayBuilder::new(variant_array.len());
94 match typed_value.data_type() {
95 DataType::Int32 => {
96 let primitive_array = typed_value.as_primitive::<Int32Type>();
97 for i in 0..variant_array.len() {
98 if primitive_array.is_null(i) {
99 array_builder.append_null();
100 continue;
101 }
102
103 let int_value = primitive_array.value(i);
104 array_builder.append_variant(Variant::from(int_value));
105 }
106 }
107 dt => {
108 // https://github.com/apache/arrow-rs/issues/8087
109 return Err(ArrowError::NotYetImplemented(format!(
110 "variant_get fully_shredded with typed_value={dt} is not implemented yet",
111 )));
112 }
113 };
114 Ok(Arc::new(array_builder.build()))
115 }
116
117 fn unshredded(
118 &self,
119 variant_array: &VariantArray,
120 _metadata: &BinaryViewArray,
121 _value_field: &BinaryViewArray,
122 ) -> arrow::error::Result<ArrayRef> {
123 let mut builder = VariantArrayBuilder::new(variant_array.len());
124 for i in 0..variant_array.len() {
125 let new_variant = variant_array.value(i);
126
127 // TODO: perf?
128 let Some(new_variant) = new_variant.get_path(&self.path) else {
129 // path not found, append null
130 builder.append_null();
131 continue;
132 };
133
134 // TODO: we're decoding the value and doing a copy into a variant value
135 // again. This can be much faster by using the _metadata and _value_field
136 // to avoid decoding the entire variant:
137 //
138 // 1) reuse the metadata arrays as is
139 //
140 // 2) Create a new BinaryViewArray that uses the same underlying buffers
141 // that the original variant used, but whose views points to a new
142 // offset for the new path
143 builder.append_variant(new_variant);
144 }
145
146 Ok(Arc::new(builder.build()))
147 }
148}