parquet_variant_compute/
variant_array_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArrayBuilder`] implementation
19
20use crate::VariantArray;
21use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
22use arrow_schema::{ArrowError, DataType, Field, Fields};
23use parquet_variant::{ListBuilder, ObjectBuilder, Variant, VariantBuilderExt};
24use parquet_variant::{ParentState, ValueBuilder, WritableMetadataBuilder};
25use std::sync::Arc;
26
/// A builder for [`VariantArray`]
///
/// Rows are appended one at a time: as a [`Variant`] value
/// ([`Self::append_variant`]), as a null row ([`Self::append_null`]), or through a
/// [`VariantArrayVariantBuilder`] ([`Self::variant_builder`]) that writes
/// directly into this builder's buffers. [`Self::build`] then assembles the
/// final array.
///
/// This builder always creates a `VariantArray` using [`BinaryViewArray`] for both
/// the metadata and value fields.
///
/// # TODO
/// 1. Support shredding: <https://github.com/apache/arrow-rs/issues/7895>
///
/// ## Example:
/// ```
/// # use arrow::array::Array;
/// # use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt};
/// # use parquet_variant_compute::VariantArrayBuilder;
/// // Create a new VariantArrayBuilder with a capacity of 100 rows
/// let mut builder = VariantArrayBuilder::new(100);
/// // append variant values
/// builder.append_variant(Variant::from(42));
/// // append a null row (note not a Variant::Null)
/// builder.append_null();
/// // append an object to the builder
/// let mut vb = builder.variant_builder();
/// vb.new_object()
///   .with_field("foo", "bar")
///   .finish();
/// vb.finish(); // must call finish to write the variant to the buffers
///
/// // create the final VariantArray
/// let variant_array = builder.build();
/// assert_eq!(variant_array.len(), 3);
/// // Access the values
/// // row 0 is not null and is an integer
/// assert!(!variant_array.is_null(0));
/// assert_eq!(variant_array.value(0), Variant::from(42i32));
/// // row 1 is null
/// assert!(variant_array.is_null(1));
/// // row 2 is not null and is an object
/// assert!(!variant_array.is_null(2));
/// let value = variant_array.value(2);
/// let obj = value.as_object().expect("expected object");
/// assert_eq!(obj.get("foo"), Some(Variant::from("bar")));
/// ```
#[derive(Debug)]
pub struct VariantArrayBuilder {
    /// Validity (null / non-null) of every row appended so far
    nulls: NullBufferBuilder,
    /// Builder that accumulates the serialized metadata of all rows in one buffer
    metadata_builder: WritableMetadataBuilder,
    /// Ending offset for each serialized metadata dictionary in the buffer
    /// (one entry per row, including null rows)
    metadata_offsets: Vec<usize>,
    /// Builder that accumulates the serialized values of all rows in one buffer
    value_builder: ValueBuilder,
    /// Ending offset for each serialized variant value in the buffer
    /// (one entry per row, including null rows)
    value_offsets: Vec<usize>,
    /// The fields of the final `StructArray`: non-nullable `metadata` and
    /// `value` columns, both of type `BinaryView`
    ///
    /// TODO: 1) Add extension type metadata
    /// TODO: 2) Add support for shredding
    fields: Fields,
}
89
90impl VariantArrayBuilder {
91    pub fn new(row_capacity: usize) -> Self {
92        // The subfields are expected to be non-nullable according to the parquet variant spec.
93        let metadata_field = Field::new("metadata", DataType::BinaryView, false);
94        let value_field = Field::new("value", DataType::BinaryView, false);
95
96        Self {
97            nulls: NullBufferBuilder::new(row_capacity),
98            metadata_builder: WritableMetadataBuilder::default(),
99            metadata_offsets: Vec::with_capacity(row_capacity),
100            value_builder: ValueBuilder::new(),
101            value_offsets: Vec::with_capacity(row_capacity),
102            fields: Fields::from(vec![metadata_field, value_field]),
103        }
104    }
105
106    /// Build the final builder
107    pub fn build(self) -> VariantArray {
108        let Self {
109            mut nulls,
110            metadata_builder,
111            metadata_offsets,
112            value_builder,
113            value_offsets,
114            fields,
115        } = self;
116
117        let metadata_buffer = metadata_builder.into_inner();
118        let metadata_array = binary_view_array_from_buffers(metadata_buffer, metadata_offsets);
119
120        let value_buffer = value_builder.into_inner();
121        let value_array = binary_view_array_from_buffers(value_buffer, value_offsets);
122
123        // The build the final struct array
124        let inner = StructArray::new(
125            fields,
126            vec![
127                Arc::new(metadata_array) as ArrayRef,
128                Arc::new(value_array) as ArrayRef,
129            ],
130            nulls.finish(),
131        );
132        // TODO add arrow extension type metadata
133
134        VariantArray::try_new(Arc::new(inner)).expect("valid VariantArray by construction")
135    }
136
137    /// Appends a null row to the builder.
138    pub fn append_null(&mut self) {
139        self.nulls.append_null();
140        // The subfields are expected to be non-nullable according to the parquet variant spec.
141        self.metadata_offsets.push(self.metadata_builder.offset());
142        self.value_offsets.push(self.value_builder.offset());
143    }
144
145    /// Append the [`Variant`] to the builder as the next row
146    pub fn append_variant(&mut self, variant: Variant) {
147        let mut direct_builder = self.variant_builder();
148        direct_builder.append_value(variant);
149        direct_builder.finish()
150    }
151
152    /// Return a `VariantArrayVariantBuilder` that writes directly to the
153    /// buffers of this builder.
154    ///
155    /// You must call [`VariantArrayVariantBuilder::finish`] to complete the builder
156    ///
157    /// # Example
158    /// ```
159    /// # use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt};
160    /// # use parquet_variant_compute::{VariantArray, VariantArrayBuilder};
161    /// let mut array_builder = VariantArrayBuilder::new(10);
162    ///
163    /// // First row has a string
164    /// let mut variant_builder = array_builder.variant_builder();
165    /// variant_builder.append_value("Hello, World!");
166    /// // must call finish to write the variant to the buffers
167    /// variant_builder.finish();
168    ///
169    /// // Second row is an object
170    /// let mut variant_builder = array_builder.variant_builder();
171    /// variant_builder
172    ///     .new_object()
173    ///     .with_field("my_field", 42i64)
174    ///     .finish();
175    /// variant_builder.finish();
176    ///
177    /// // finalize the array
178    /// let variant_array: VariantArray = array_builder.build();
179    ///
180    /// // verify what we wrote is still there
181    /// assert_eq!(variant_array.value(0), Variant::from("Hello, World!"));
182    /// assert!(variant_array.value(1).as_object().is_some());
183    ///  ```
184    pub fn variant_builder(&mut self) -> VariantArrayVariantBuilder<'_> {
185        VariantArrayVariantBuilder::new(self)
186    }
187}
188
/// A `VariantBuilderExt` that writes directly to the buffers of a `VariantArrayBuilder`.
///
/// This struct implements [`VariantBuilderExt`], so in most cases it can be used as a
/// `VariantBuilder` to perform variant-related operations for [`VariantArrayBuilder`].
///
/// If [`Self::finish`] is not called, any changes will be rolled back
///
/// See [`VariantArrayBuilder::variant_builder`] for an example
pub struct VariantArrayVariantBuilder<'a> {
    /// State wrapping the array builder's value and metadata builders; all
    /// variant bytes for the row are written through it
    parent_state: ParentState<'a>,
    /// The array builder's per-row metadata end offsets; `finish` pushes one entry
    metadata_offsets: &'a mut Vec<usize>,
    /// The array builder's per-row value end offsets; `finish` pushes one entry
    value_offsets: &'a mut Vec<usize>,
    /// The array builder's row validity; `finish` marks the row as non-null
    nulls: &'a mut NullBufferBuilder,
}
203
204impl VariantBuilderExt for VariantArrayVariantBuilder<'_> {
205    fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
206        ValueBuilder::append_variant(self.parent_state(), value.into());
207    }
208
209    fn try_new_list(&mut self) -> Result<ListBuilder<'_>, ArrowError> {
210        Ok(ListBuilder::new(self.parent_state(), false))
211    }
212
213    fn try_new_object(&mut self) -> Result<ObjectBuilder<'_>, ArrowError> {
214        Ok(ObjectBuilder::new(self.parent_state(), false))
215    }
216}
217
218impl<'a> VariantArrayVariantBuilder<'a> {
219    /// Constructs a new VariantArrayVariantBuilder
220    ///
221    /// Note this is not public as this is a structure that is logically
222    /// part of the [`VariantArrayBuilder`] and relies on its internal structure
223    fn new(builder: &'a mut VariantArrayBuilder) -> Self {
224        let parent_state =
225            ParentState::variant(&mut builder.value_builder, &mut builder.metadata_builder);
226        VariantArrayVariantBuilder {
227            parent_state,
228            metadata_offsets: &mut builder.metadata_offsets,
229            value_offsets: &mut builder.value_offsets,
230            nulls: &mut builder.nulls,
231        }
232    }
233
234    /// Called to finish the in progress variant and write it to the underlying
235    /// buffers
236    ///
237    /// Note if you do not call finish, on drop any changes made to the
238    /// underlying buffers will be rolled back.
239    pub fn finish(mut self) {
240        // Record the ending offsets after finishing metadata and finish the parent state.
241        let (value_builder, metadata_builder) = self.parent_state.value_and_metadata_builders();
242        self.metadata_offsets.push(metadata_builder.finish());
243        self.value_offsets.push(value_builder.offset());
244        self.nulls.append_non_null();
245        self.parent_state.finish();
246    }
247
248    fn parent_state(&mut self) -> ParentState<'_> {
249        let (value_builder, metadata_builder) = self.parent_state.value_and_metadata_builders();
250        ParentState::variant(value_builder, metadata_builder)
251    }
252}
253
// Intentionally empty `Drop` implementation; no cleanup is performed here.
//
// Because the type implements `Drop`, a value of it counts as "in use" until it
// goes out of scope, which keeps its mutable borrow of the parent
// `VariantArrayBuilder` alive until then. Touching the array builder while an
// unfinished `VariantArrayVariantBuilder` is still in scope is therefore a
// borrow-check error, which flags a forgotten `finish()` call (`finish` takes
// `self` by value and so ends the borrow).
impl Drop for VariantArrayVariantBuilder<'_> {
    fn drop(&mut self) {}
}
258
259fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> BinaryViewArray {
260    // All offsets are less than or equal to the buffer length, so we can safely cast all offsets
261    // inside the loop below, as long as the buffer length fits in u32.
262    u32::try_from(buffer.len()).expect("buffer length should fit in u32");
263
264    let mut builder = BinaryViewBuilder::with_capacity(offsets.len());
265    let block = builder.append_block(buffer.into());
266    // TODO this can be much faster if it creates the views directly during append
267    let mut start = 0;
268    for end in offsets {
269        let end = end as u32; // Safe cast: validated max offset fits in u32 above
270        builder
271            .try_append_view(block, start, end - start)
272            .expect("Failed to append view");
273        start = end;
274    }
275    builder.finish()
276}
277
#[cfg(test)]
mod test {
    use super::*;
    use arrow::array::Array;

    /// Neither the metadata column nor the value column may be nullable, even
    /// when the array contains null rows.
    #[test]
    fn test_variant_array_builder_non_nullable() {
        let mut array_builder = VariantArrayBuilder::new(10);
        array_builder.append_null(); // should not panic
        array_builder.append_variant(Variant::from(42i32));
        let array = array_builder.build();

        assert_eq!(array.len(), 2);
        assert!(array.is_null(0));
        assert!(!array.is_null(1));
        assert_eq!(array.value(1), Variant::from(42i32));

        // the metadata and value fields of non shredded variants should not be null
        assert!(array.metadata_field().nulls().is_none());
        assert!(array.value_field().unwrap().nulls().is_none());
        match array.data_type() {
            DataType::Struct(fields) => {
                for field in fields {
                    assert!(
                        !field.is_nullable(),
                        "Field {} should be non-nullable",
                        field.name()
                    );
                }
            }
            _ => panic!("Expected VariantArray to have Struct data type"),
        }
    }

    /// Rows written through sub builders end up in the array alongside rows
    /// appended directly.
    #[test]
    fn test_variant_array_builder_variant_builder() {
        let mut array_builder = VariantArrayBuilder::new(10);
        array_builder.append_null(); // should not panic
        array_builder.append_variant(Variant::from(42i32));

        // third row: an object written through a sub builder
        let mut object_row = array_builder.variant_builder();
        object_row.new_object().with_field("foo", "bar").finish();
        object_row.finish(); // must call finish to write the variant to the buffers

        // fourth row: a list written through a sub builder
        let mut list_row = array_builder.variant_builder();
        list_row
            .new_list()
            .with_value(Variant::from(1i32))
            .with_value(Variant::from(2i32))
            .finish();
        list_row.finish();

        let array = array_builder.build();
        assert_eq!(array.len(), 4);

        assert!(array.is_null(0));

        assert!(!array.is_null(1));
        assert_eq!(array.value(1), Variant::from(42i32));

        assert!(!array.is_null(2));
        let object_variant = array.value(2);
        let object = object_variant
            .as_object()
            .expect("variant to be an object");
        assert_eq!(object.get("foo").unwrap(), Variant::from("bar"));

        assert!(!array.is_null(3));
        let list_variant = array.value(3);
        let list = list_variant.as_list().expect("variant to be a list");
        assert_eq!(list.len(), 2);
    }

    /// A sub builder that is dropped without `finish` must leave no trace in
    /// the final array.
    #[test]
    fn test_variant_array_builder_variant_builder_reset() {
        let mut array_builder = VariantArrayBuilder::new(10);

        // first row: a finished object
        let mut first_row = array_builder.variant_builder();
        first_row.new_object().with_field("foo", 1i32).finish();
        first_row.finish(); // must call finish to write the variant to the buffers

        // start an object but let the sub builder go out of scope without finishing it
        {
            let mut abandoned_row = array_builder.variant_builder();
            abandoned_row.new_object().with_field("bar", 2i32).finish();
        }

        // another finished object (this should reset the previous unfinished object)
        let mut second_row = array_builder.variant_builder();
        second_row.new_object().with_field("baz", 3i32).finish();
        second_row.finish(); // must call finish to write the variant to the buffers

        let array = array_builder.build();

        // only the two finished objects should be present
        assert_eq!(array.len(), 2);

        assert!(!array.is_null(0));
        let variant = array.value(0);
        assert_eq!(
            variant.get_object_field("foo"),
            Some(Variant::from(1i32)),
            "Expected an object with field \"foo\", got: {variant:?}"
        );

        assert!(!array.is_null(1));
        let variant = array.value(1);
        assert_eq!(
            variant.get_object_field("baz"),
            Some(Variant::from(3i32)),
            "Expected an object with field \"baz\", got: {variant:?}"
        );
    }
}
387}