parquet_variant_compute/variant_array_builder.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArrayBuilder`] implementation
19
20use crate::VariantArray;
21use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
22use arrow_schema::{ArrowError, DataType, Field, Fields};
23use parquet_variant::{ListBuilder, ObjectBuilder, Variant, VariantBuilderExt};
24use parquet_variant::{ParentState, ValueBuilder, WritableMetadataBuilder};
25use std::sync::Arc;
26
/// A builder for [`VariantArray`]
///
/// Rows are appended one at a time, either as a complete [`Variant`]
/// ([`Self::append_variant`]), as a null row ([`Self::append_null`]), or through a
/// [`VariantArrayVariantBuilder`] that writes directly into this builder's buffers
/// ([`Self::variant_builder`]).
///
/// This builder always creates a `VariantArray` using [`BinaryViewArray`] for both
/// the metadata and value fields.
///
/// # TODO
/// 1. Support shredding: <https://github.com/apache/arrow-rs/issues/7895>
///
/// ## Example:
/// ```
/// # use arrow::array::Array;
/// # use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt};
/// # use parquet_variant_compute::VariantArrayBuilder;
/// // Create a new VariantArrayBuilder with a capacity of 100 rows
/// let mut builder = VariantArrayBuilder::new(100);
/// // append variant values
/// builder.append_variant(Variant::from(42));
/// // append a null row (note not a Variant::Null)
/// builder.append_null();
/// // append an object to the builder
/// let mut vb = builder.variant_builder();
/// vb.new_object()
///     .with_field("foo", "bar")
///     .finish();
/// vb.finish(); // must call finish to write the variant to the buffers
///
/// // create the final VariantArray
/// let variant_array = builder.build();
/// assert_eq!(variant_array.len(), 3);
/// // Access the values
/// // row 0 is not null and is an integer
/// assert!(!variant_array.is_null(0));
/// assert_eq!(variant_array.value(0), Variant::from(42i32));
/// // row 1 is null
/// assert!(variant_array.is_null(1));
/// // row 2 is not null and is an object
/// assert!(!variant_array.is_null(2));
/// let value = variant_array.value(2);
/// let obj = value.as_object().expect("expected object");
/// assert_eq!(obj.get("foo"), Some(Variant::from("bar")));
/// ```
#[derive(Debug)]
pub struct VariantArrayBuilder {
    /// Row-level validity; one entry is appended per row
    nulls: NullBufferBuilder,
    /// Builder for all the metadata; every row writes into this one shared builder
    metadata_builder: WritableMetadataBuilder,
    /// Ending offset for each serialized metadata dictionary in the buffer
    /// (one entry per row; a row starts where the previous row ended)
    metadata_offsets: Vec<usize>,
    /// Builder for values; every row writes into this one shared builder
    value_builder: ValueBuilder,
    /// Ending offset for each serialized variant value in the buffer
    /// (one entry per row; a row starts where the previous row ended)
    value_offsets: Vec<usize>,
    /// The fields of the final `StructArray`
    ///
    /// TODO: 1) Add extension type metadata
    /// TODO: 2) Add support for shredding
    fields: Fields,
}
89
90impl VariantArrayBuilder {
91 pub fn new(row_capacity: usize) -> Self {
92 // The subfields are expected to be non-nullable according to the parquet variant spec.
93 let metadata_field = Field::new("metadata", DataType::BinaryView, false);
94 let value_field = Field::new("value", DataType::BinaryView, false);
95
96 Self {
97 nulls: NullBufferBuilder::new(row_capacity),
98 metadata_builder: WritableMetadataBuilder::default(),
99 metadata_offsets: Vec::with_capacity(row_capacity),
100 value_builder: ValueBuilder::new(),
101 value_offsets: Vec::with_capacity(row_capacity),
102 fields: Fields::from(vec![metadata_field, value_field]),
103 }
104 }
105
106 /// Build the final builder
107 pub fn build(self) -> VariantArray {
108 let Self {
109 mut nulls,
110 metadata_builder,
111 metadata_offsets,
112 value_builder,
113 value_offsets,
114 fields,
115 } = self;
116
117 let metadata_buffer = metadata_builder.into_inner();
118 let metadata_array = binary_view_array_from_buffers(metadata_buffer, metadata_offsets);
119
120 let value_buffer = value_builder.into_inner();
121 let value_array = binary_view_array_from_buffers(value_buffer, value_offsets);
122
123 // The build the final struct array
124 let inner = StructArray::new(
125 fields,
126 vec![
127 Arc::new(metadata_array) as ArrayRef,
128 Arc::new(value_array) as ArrayRef,
129 ],
130 nulls.finish(),
131 );
132 // TODO add arrow extension type metadata
133
134 VariantArray::try_new(Arc::new(inner)).expect("valid VariantArray by construction")
135 }
136
137 /// Appends a null row to the builder.
138 pub fn append_null(&mut self) {
139 self.nulls.append_null();
140 // The subfields are expected to be non-nullable according to the parquet variant spec.
141 self.metadata_offsets.push(self.metadata_builder.offset());
142 self.value_offsets.push(self.value_builder.offset());
143 }
144
145 /// Append the [`Variant`] to the builder as the next row
146 pub fn append_variant(&mut self, variant: Variant) {
147 let mut direct_builder = self.variant_builder();
148 direct_builder.append_value(variant);
149 direct_builder.finish()
150 }
151
152 /// Return a `VariantArrayVariantBuilder` that writes directly to the
153 /// buffers of this builder.
154 ///
155 /// You must call [`VariantArrayVariantBuilder::finish`] to complete the builder
156 ///
157 /// # Example
158 /// ```
159 /// # use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt};
160 /// # use parquet_variant_compute::{VariantArray, VariantArrayBuilder};
161 /// let mut array_builder = VariantArrayBuilder::new(10);
162 ///
163 /// // First row has a string
164 /// let mut variant_builder = array_builder.variant_builder();
165 /// variant_builder.append_value("Hello, World!");
166 /// // must call finish to write the variant to the buffers
167 /// variant_builder.finish();
168 ///
169 /// // Second row is an object
170 /// let mut variant_builder = array_builder.variant_builder();
171 /// variant_builder
172 /// .new_object()
173 /// .with_field("my_field", 42i64)
174 /// .finish();
175 /// variant_builder.finish();
176 ///
177 /// // finalize the array
178 /// let variant_array: VariantArray = array_builder.build();
179 ///
180 /// // verify what we wrote is still there
181 /// assert_eq!(variant_array.value(0), Variant::from("Hello, World!"));
182 /// assert!(variant_array.value(1).as_object().is_some());
183 /// ```
184 pub fn variant_builder(&mut self) -> VariantArrayVariantBuilder<'_> {
185 VariantArrayVariantBuilder::new(self)
186 }
187}
188
/// A `VariantBuilderExt` that writes directly to the buffers of a `VariantArrayBuilder`.
///
/// This struct implements [`VariantBuilderExt`], so in most cases it can be used as a
/// `VariantBuilder` to perform variant-related operations for [`VariantArrayBuilder`].
///
/// If [`Self::finish`] is not called, any changes will be rolled back
///
/// See [`VariantArrayBuilder::variant_builder`] for an example
pub struct VariantArrayVariantBuilder<'a> {
    /// State wrapping the parent's value and metadata builders for the row in progress
    parent_state: ParentState<'a>,
    /// The parent's per-row metadata end offsets; one entry is pushed by [`Self::finish`]
    metadata_offsets: &'a mut Vec<usize>,
    /// The parent's per-row value end offsets; one entry is pushed by [`Self::finish`]
    value_offsets: &'a mut Vec<usize>,
    /// The parent's validity builder; [`Self::finish`] marks the row as non-null
    nulls: &'a mut NullBufferBuilder,
}
203
204impl VariantBuilderExt for VariantArrayVariantBuilder<'_> {
205 fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
206 ValueBuilder::append_variant(self.parent_state(), value.into());
207 }
208
209 fn try_new_list(&mut self) -> Result<ListBuilder<'_>, ArrowError> {
210 Ok(ListBuilder::new(self.parent_state(), false))
211 }
212
213 fn try_new_object(&mut self) -> Result<ObjectBuilder<'_>, ArrowError> {
214 Ok(ObjectBuilder::new(self.parent_state(), false))
215 }
216}
217
218impl<'a> VariantArrayVariantBuilder<'a> {
219 /// Constructs a new VariantArrayVariantBuilder
220 ///
221 /// Note this is not public as this is a structure that is logically
222 /// part of the [`VariantArrayBuilder`] and relies on its internal structure
223 fn new(builder: &'a mut VariantArrayBuilder) -> Self {
224 let parent_state =
225 ParentState::variant(&mut builder.value_builder, &mut builder.metadata_builder);
226 VariantArrayVariantBuilder {
227 parent_state,
228 metadata_offsets: &mut builder.metadata_offsets,
229 value_offsets: &mut builder.value_offsets,
230 nulls: &mut builder.nulls,
231 }
232 }
233
234 /// Called to finish the in progress variant and write it to the underlying
235 /// buffers
236 ///
237 /// Note if you do not call finish, on drop any changes made to the
238 /// underlying buffers will be rolled back.
239 pub fn finish(mut self) {
240 // Record the ending offsets after finishing metadata and finish the parent state.
241 let (value_builder, metadata_builder) = self.parent_state.value_and_metadata_builders();
242 self.metadata_offsets.push(metadata_builder.finish());
243 self.value_offsets.push(value_builder.offset());
244 self.nulls.append_non_null();
245 self.parent_state.finish();
246 }
247
248 fn parent_state(&mut self) -> ParentState<'_> {
249 let (value_builder, metadata_builder) = self.parent_state.value_and_metadata_builders();
250 ParentState::variant(value_builder, metadata_builder)
251 }
252}
253
// Intentionally empty `Drop` implementation to help with borrow checking.
//
// Because the type implements `Drop`, its borrow of the parent `VariantArrayBuilder`
// is treated as in use until the value goes out of scope (or is consumed by
// `finish()`). Touching the parent while an unfinished sub-builder is still alive is
// therefore rejected by the compiler, which warns users if they forget to call finish().
impl Drop for VariantArrayVariantBuilder<'_> {
    fn drop(&mut self) {}
}
258
259fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> BinaryViewArray {
260 // All offsets are less than or equal to the buffer length, so we can safely cast all offsets
261 // inside the loop below, as long as the buffer length fits in u32.
262 u32::try_from(buffer.len()).expect("buffer length should fit in u32");
263
264 let mut builder = BinaryViewBuilder::with_capacity(offsets.len());
265 let block = builder.append_block(buffer.into());
266 // TODO this can be much faster if it creates the views directly during append
267 let mut start = 0;
268 for end in offsets {
269 let end = end as u32; // Safe cast: validated max offset fits in u32 above
270 builder
271 .try_append_view(block, start, end - start)
272 .expect("Failed to append view");
273 start = end;
274 }
275 builder.finish()
276}
277
#[cfg(test)]
mod test {
    use super::*;
    use arrow::array::Array;

    /// The struct children must be non-nullable and carry no null buffer, even
    /// when the array contains a null row.
    #[test]
    fn test_variant_array_builder_non_nullable() {
        let mut array_builder = VariantArrayBuilder::new(10);
        array_builder.append_null(); // should not panic
        array_builder.append_variant(Variant::from(42i32));
        let array = array_builder.build();

        assert_eq!(array.len(), 2);
        assert!(array.is_null(0));
        assert!(!array.is_null(1));
        assert_eq!(array.value(1), Variant::from(42i32));

        // neither child of an unshredded variant may have a null buffer
        assert!(array.metadata_field().nulls().is_none());
        assert!(array.value_field().unwrap().nulls().is_none());

        // and the schema must declare both children as non-nullable
        let DataType::Struct(children) = array.data_type() else {
            panic!("Expected VariantArray to have Struct data type");
        };
        for child in children {
            assert!(
                !child.is_nullable(),
                "Field {} should be non-nullable",
                child.name()
            );
        }
    }

    /// Rows written through sub builders are interleaved correctly with rows
    /// appended directly on the array builder.
    #[test]
    fn test_variant_array_builder_variant_builder() {
        let mut array_builder = VariantArrayBuilder::new(10);
        array_builder.append_null(); // should not panic
        array_builder.append_variant(Variant::from(42i32));

        // row 2: an object written through a sub builder
        {
            let mut row = array_builder.variant_builder();
            row.new_object().with_field("foo", "bar").finish();
            row.finish(); // must call finish to write the variant to the buffers
        }

        // row 3: a list written through a sub builder
        {
            let mut row = array_builder.variant_builder();
            row.new_list()
                .with_value(Variant::from(1i32))
                .with_value(Variant::from(2i32))
                .finish();
            row.finish();
        }

        let array = array_builder.build();
        assert_eq!(array.len(), 4);

        assert!(array.is_null(0));

        assert!(!array.is_null(1));
        assert_eq!(array.value(1), Variant::from(42i32));

        assert!(!array.is_null(2));
        let object_row = array.value(2);
        let object = object_row.as_object().expect("variant to be an object");
        assert_eq!(object.get("foo").unwrap(), Variant::from("bar"));

        assert!(!array.is_null(3));
        let list_row = array.value(3);
        let list = list_row.as_list().expect("variant to be a list");
        assert_eq!(list.len(), 2);
    }

    /// A sub builder that is dropped without `finish` must leave no trace in the
    /// final array.
    #[test]
    fn test_variant_array_builder_variant_builder_reset() {
        let mut array_builder = VariantArrayBuilder::new(10);

        // first row: finished, so it is kept
        {
            let mut row = array_builder.variant_builder();
            row.new_object().with_field("foo", 1i32).finish();
            row.finish(); // must call finish to write the variant to the buffers
        }

        // abandoned row: goes out of scope without `finish`, so it is discarded
        {
            let mut abandoned = array_builder.variant_builder();
            abandoned.new_object().with_field("bar", 2i32).finish();
        }

        // next row: finished; it takes the place of the abandoned one
        {
            let mut row = array_builder.variant_builder();
            row.new_object().with_field("baz", 3i32).finish();
            row.finish(); // must call finish to write the variant to the buffers
        }

        let array = array_builder.build();

        // only the two finished objects should be present
        assert_eq!(array.len(), 2);

        assert!(!array.is_null(0));
        let variant = array.value(0);
        assert_eq!(
            variant.get_object_field("foo"),
            Some(Variant::from(1i32)),
            "Expected an object with field \"foo\", got: {variant:?}"
        );

        assert!(!array.is_null(1));
        let variant = array.value(1);
        assert_eq!(
            variant.get_object_field("baz"),
            Some(Variant::from(3i32)),
            "Expected an object with field \"baz\", got: {variant:?}"
        );
    }
}
387}