parquet_variant_compute/variant_array_builder.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArrayBuilder`] implementation
19
20use crate::VariantArray;
21use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
22use arrow_schema::{DataType, Field, Fields};
23use parquet_variant::{Variant, VariantBuilder};
24use std::sync::Arc;
25
26/// A builder for [`VariantArray`]
27///
28/// This builder is used to construct a `VariantArray` and allows APIs for
29/// adding metadata
30///
31/// This builder always creates a `VariantArray` using [`BinaryViewArray`] for both
32/// the metadata and value fields.
33///
34/// # TODO
35/// 1. Support shredding: <https://github.com/apache/arrow-rs/issues/7895>
36///
37/// ## Example:
38/// ```
39/// # use arrow::array::Array;
40/// # use parquet_variant::{Variant, VariantBuilder};
41/// # use parquet_variant_compute::VariantArrayBuilder;
42/// // Create a new VariantArrayBuilder with a capacity of 100 rows
43/// let mut builder = VariantArrayBuilder::new(100);
44/// // append variant values
45/// builder.append_variant(Variant::from(42));
46/// // append a null row
47/// builder.append_null();
48/// // append a pre-constructed metadata and value buffers
49/// let (metadata, value) = {
50/// let mut vb = VariantBuilder::new();
51/// let mut obj = vb.new_object();
52/// obj.insert("foo", "bar");
53/// obj.finish().unwrap();
54/// vb.finish()
55/// };
56/// builder.append_variant_buffers(&metadata, &value);
57///
58/// // create the final VariantArray
59/// let variant_array = builder.build();
60/// assert_eq!(variant_array.len(), 3);
61/// // // Access the values
62/// // row 1 is not null and is an integer
63/// assert!(!variant_array.is_null(0));
64/// assert_eq!(variant_array.value(0), Variant::from(42i32));
65/// // row 1 is null
66/// assert!(variant_array.is_null(1));
67/// // row 2 is not null and is an object
68/// assert!(!variant_array.is_null(2));
69/// assert!(variant_array.value(2).as_object().is_some());
70/// ```
71#[derive(Debug)]
72pub struct VariantArrayBuilder {
73 /// Nulls
74 nulls: NullBufferBuilder,
75 /// buffer for all the metadata
76 metadata_buffer: Vec<u8>,
77 /// (offset, len) pairs for locations of metadata in the buffer
78 metadata_locations: Vec<(usize, usize)>,
79 /// buffer for values
80 value_buffer: Vec<u8>,
81 /// (offset, len) pairs for locations of values in the buffer
82 value_locations: Vec<(usize, usize)>,
83 /// The fields of the final `StructArray`
84 ///
85 /// TODO: 1) Add extension type metadata
86 /// TODO: 2) Add support for shredding
87 fields: Fields,
88}
89
90impl VariantArrayBuilder {
91 pub fn new(row_capacity: usize) -> Self {
92 // The subfields are expected to be non-nullable according to the parquet variant spec.
93 let metadata_field = Field::new("metadata", DataType::BinaryView, false);
94 let value_field = Field::new("value", DataType::BinaryView, false);
95
96 Self {
97 nulls: NullBufferBuilder::new(row_capacity),
98 metadata_buffer: Vec::new(), // todo allocation capacity
99 metadata_locations: Vec::with_capacity(row_capacity),
100 value_buffer: Vec::new(),
101 value_locations: Vec::with_capacity(row_capacity),
102 fields: Fields::from(vec![metadata_field, value_field]),
103 }
104 }
105
106 /// Build the final builder
107 pub fn build(self) -> VariantArray {
108 let Self {
109 mut nulls,
110 metadata_buffer,
111 metadata_locations,
112 value_buffer,
113 value_locations,
114 fields,
115 } = self;
116
117 let metadata_array = binary_view_array_from_buffers(metadata_buffer, metadata_locations);
118
119 let value_array = binary_view_array_from_buffers(value_buffer, value_locations);
120
121 // The build the final struct array
122 let inner = StructArray::new(
123 fields,
124 vec![
125 Arc::new(metadata_array) as ArrayRef,
126 Arc::new(value_array) as ArrayRef,
127 ],
128 nulls.finish(),
129 );
130 // TODO add arrow extension type metadata
131
132 VariantArray::try_new(Arc::new(inner)).expect("valid VariantArray by construction")
133 }
134
135 /// Appends a null row to the builder.
136 pub fn append_null(&mut self) {
137 self.nulls.append_null();
138 // The subfields are expected to be non-nullable according to the parquet variant spec.
139 let metadata_offset = self.metadata_buffer.len();
140 let metadata_length = 0;
141 self.metadata_locations
142 .push((metadata_offset, metadata_length));
143 let value_offset = self.value_buffer.len();
144 let value_length = 0;
145 self.value_locations.push((value_offset, value_length));
146 }
147
148 /// Append the [`Variant`] to the builder as the next row
149 pub fn append_variant(&mut self, variant: Variant) {
150 // TODO make this more efficient by avoiding the intermediate buffers
151 let mut variant_builder = VariantBuilder::new();
152 variant_builder.append_value(variant);
153 let (metadata, value) = variant_builder.finish();
154 self.append_variant_buffers(&metadata, &value);
155 }
156
157 /// Append a metadata and values buffer to the builder
158 pub fn append_variant_buffers(&mut self, metadata: &[u8], value: &[u8]) {
159 self.nulls.append_non_null();
160 let metadata_length = metadata.len();
161 let metadata_offset = self.metadata_buffer.len();
162 self.metadata_locations
163 .push((metadata_offset, metadata_length));
164 self.metadata_buffer.extend_from_slice(metadata);
165 let value_length = value.len();
166 let value_offset = self.value_buffer.len();
167 self.value_locations.push((value_offset, value_length));
168 self.value_buffer.extend_from_slice(value);
169 }
170
171 // TODO: Return a Variant builder that will write to the underlying buffers (TODO)
172}
173
174fn binary_view_array_from_buffers(
175 buffer: Vec<u8>,
176 locations: Vec<(usize, usize)>,
177) -> BinaryViewArray {
178 let mut builder = BinaryViewBuilder::with_capacity(locations.len());
179 let block = builder.append_block(buffer.into());
180 // TODO this can be much faster if it creates the views directly during append
181 for (offset, length) in locations {
182 let offset = offset.try_into().expect("offset should fit in u32");
183 let length = length.try_into().expect("length should fit in u32");
184 builder
185 .try_append_view(block, offset, length)
186 .expect("Failed to append view");
187 }
188 builder.finish()
189}
190
#[cfg(test)]
mod test {
    use super::*;
    use arrow::array::Array;

    /// A null row followed by a value must yield non-nullable `metadata` and
    /// `value` children with no null buffers of their own.
    #[test]
    fn test_variant_array_builder_non_nullable() {
        let variant_array = {
            let mut builder = VariantArrayBuilder::new(10);
            builder.append_null(); // should not panic
            builder.append_variant(Variant::from(42i32));
            builder.build()
        };

        // Row-level view: first row null, second row holds the integer
        assert_eq!(variant_array.len(), 2);
        assert!(variant_array.is_null(0));
        assert!(!variant_array.is_null(1));
        assert_eq!(variant_array.value(1), Variant::from(42i32));

        // the metadata and value fields of non shredded variants should not be null
        assert!(variant_array.metadata_field().nulls().is_none());
        assert!(variant_array.value_field().nulls().is_none());

        // Schema-level view: every struct child is declared non-nullable
        match variant_array.data_type() {
            DataType::Struct(fields) => fields.iter().for_each(|field| {
                assert!(
                    !field.is_nullable(),
                    "Field {} should be non-nullable",
                    field.name()
                );
            }),
            _ => panic!("Expected VariantArray to have Struct data type"),
        }
    }
}
223}