parquet_variant_compute/
variant_array_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArrayBuilder`] implementation
19
20use crate::VariantArray;
21use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
22use arrow_schema::{ArrowError, DataType, Field, Fields};
23use parquet_variant::{
24    BuilderSpecificState, ListBuilder, MetadataBuilder, ObjectBuilder, Variant, VariantBuilderExt,
25    VariantMetadata,
26};
27use parquet_variant::{
28    ParentState, ReadOnlyMetadataBuilder, ValueBuilder, WritableMetadataBuilder,
29};
30use std::sync::Arc;
31
32/// A builder for [`VariantArray`]
33///
34/// This builder is used to construct a `VariantArray` and allows APIs for
35/// adding metadata
36///
37/// This builder always creates a `VariantArray` using [`BinaryViewArray`] for both
38/// the metadata and value fields.
39///
40/// # TODO
41/// 1. Support shredding: <https://github.com/apache/arrow-rs/issues/7895>
42///
43/// ## Example:
44/// ```
45/// # use arrow::array::Array;
46/// # use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt};
47/// # use parquet_variant_compute::VariantArrayBuilder;
48/// // Create a new VariantArrayBuilder with a capacity of 100 rows
49/// let mut builder = VariantArrayBuilder::new(100);
50/// // append variant values
51/// builder.append_variant(Variant::from(42));
52/// // append a null row (note not a Variant::Null)
53/// builder.append_null();
54/// // append an object to the builder using VariantBuilderExt methods directly
55/// builder.new_object()
56///   .with_field("foo", "bar")
57///   .finish();
58///
59/// // create the final VariantArray
60/// let variant_array = builder.build();
61/// assert_eq!(variant_array.len(), 3);
62/// // // Access the values
63/// // row 1 is not null and is an integer
64/// assert!(!variant_array.is_null(0));
65/// assert_eq!(variant_array.value(0), Variant::from(42i32));
66/// // row 1 is null
67/// assert!(variant_array.is_null(1));
68/// // row 2 is not null and is an object
69/// assert!(!variant_array.is_null(2));
70/// let value = variant_array.value(2);
71/// let obj = value.as_object().expect("expected object");
72/// assert_eq!(obj.get("foo"), Some(Variant::from("bar")));
73/// ```
74#[derive(Debug)]
75pub struct VariantArrayBuilder {
76    /// Nulls
77    nulls: NullBufferBuilder,
78    /// builder for all the metadata
79    metadata_builder: WritableMetadataBuilder,
80    /// ending offset for each serialized metadata dictionary in the buffer
81    metadata_offsets: Vec<usize>,
82    /// builder for values
83    value_builder: ValueBuilder,
84    /// ending offset for each serialized variant value in the buffer
85    value_offsets: Vec<usize>,
86    /// The fields of the final `StructArray`
87    ///
88    /// TODO: 1) Add extension type metadata
89    /// TODO: 2) Add support for shredding
90    fields: Fields,
91}
92
93impl VariantArrayBuilder {
94    pub fn new(row_capacity: usize) -> Self {
95        // The subfields are expected to be non-nullable according to the parquet variant spec.
96        let metadata_field = Field::new("metadata", DataType::BinaryView, false);
97        let value_field = Field::new("value", DataType::BinaryView, false);
98
99        Self {
100            nulls: NullBufferBuilder::new(row_capacity),
101            metadata_builder: WritableMetadataBuilder::default(),
102            metadata_offsets: Vec::with_capacity(row_capacity),
103            value_builder: ValueBuilder::new(),
104            value_offsets: Vec::with_capacity(row_capacity),
105            fields: Fields::from(vec![metadata_field, value_field]),
106        }
107    }
108
109    /// Build the final builder
110    pub fn build(self) -> VariantArray {
111        let Self {
112            mut nulls,
113            metadata_builder,
114            metadata_offsets,
115            value_builder,
116            value_offsets,
117            fields,
118        } = self;
119
120        let metadata_buffer = metadata_builder.into_inner();
121        let metadata_array = binary_view_array_from_buffers(metadata_buffer, metadata_offsets);
122
123        let value_buffer = value_builder.into_inner();
124        let value_array = binary_view_array_from_buffers(value_buffer, value_offsets);
125
126        // The build the final struct array
127        let inner = StructArray::new(
128            fields,
129            vec![
130                Arc::new(metadata_array) as ArrayRef,
131                Arc::new(value_array) as ArrayRef,
132            ],
133            nulls.finish(),
134        );
135        // TODO add arrow extension type metadata
136
137        VariantArray::try_new(&inner).expect("valid VariantArray by construction")
138    }
139
140    /// Appends a null row to the builder.
141    pub fn append_null(&mut self) {
142        self.nulls.append_null();
143        // The subfields are expected to be non-nullable according to the parquet variant spec.
144        self.metadata_offsets.push(self.metadata_builder.offset());
145        self.value_offsets.push(self.value_builder.offset());
146    }
147
148    /// Append the [`Variant`] to the builder as the next row
149    pub fn append_variant(&mut self, variant: Variant) {
150        ValueBuilder::append_variant(self.parent_state(), variant);
151    }
152
153    /// Creates a builder-specific parent state
154    fn parent_state(&mut self) -> ParentState<'_, ArrayBuilderState<'_>> {
155        let state = ArrayBuilderState {
156            metadata_offsets: &mut self.metadata_offsets,
157            value_offsets: &mut self.value_offsets,
158            nulls: &mut self.nulls,
159        };
160
161        ParentState::new(&mut self.value_builder, &mut self.metadata_builder, state)
162    }
163}
164
165/// Builder-specific state for array building that manages array-level offsets and nulls. See
166/// [`VariantBuilderExt`] for details.
167#[derive(Debug)]
168pub struct ArrayBuilderState<'a> {
169    metadata_offsets: &'a mut Vec<usize>,
170    value_offsets: &'a mut Vec<usize>,
171    nulls: &'a mut NullBufferBuilder,
172}
173
174// All changes are pending until finalized
175impl BuilderSpecificState for ArrayBuilderState<'_> {
176    fn finish(
177        &mut self,
178        metadata_builder: &mut dyn MetadataBuilder,
179        value_builder: &mut ValueBuilder,
180    ) {
181        self.metadata_offsets.push(metadata_builder.finish());
182        self.value_offsets.push(value_builder.offset());
183        self.nulls.append_non_null();
184    }
185}
186
187impl VariantBuilderExt for VariantArrayBuilder {
188    type State<'a>
189        = ArrayBuilderState<'a>
190    where
191        Self: 'a;
192
193    /// Appending NULL to a variant array produces an actual NULL value
194    fn append_null(&mut self) {
195        self.append_null();
196    }
197
198    fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
199        self.append_variant(value.into());
200    }
201
202    fn try_new_list(&mut self) -> Result<ListBuilder<'_, Self::State<'_>>, ArrowError> {
203        Ok(ListBuilder::new(self.parent_state(), false))
204    }
205
206    fn try_new_object(&mut self) -> Result<ObjectBuilder<'_, Self::State<'_>>, ArrowError> {
207        Ok(ObjectBuilder::new(self.parent_state(), false))
208    }
209}
210
211/// A builder for creating only the value column of a [`VariantArray`]
212///
213/// This builder is used when you have existing metadata and only need to build
214/// the value column. It's useful for scenarios like variant unshredding, data
215/// transformation, or filtering where you want to reuse existing metadata.
216///
217/// The builder produces a [`BinaryViewArray`] that can be combined with existing
218/// metadata to create a complete [`VariantArray`].
219///
220/// # Example:
221/// ```
222/// # use arrow::array::Array;
223/// # use parquet_variant::{Variant};
224/// # use parquet_variant_compute::VariantValueArrayBuilder;
225/// // Create a variant value builder for 10 rows
226/// let mut builder = VariantValueArrayBuilder::new(10);
227///
228/// // Append some values with their corresponding metadata, which the
229/// // builder takes advantage of to avoid creating new metadata.
230/// builder.append_value(Variant::from(42));
231/// builder.append_null();
232/// builder.append_value(Variant::from("hello"));
233///
234/// // Build the final value array
235/// let value_array = builder.build().unwrap();
236/// assert_eq!(value_array.len(), 3);
237/// ```
238#[derive(Debug)]
239pub struct VariantValueArrayBuilder {
240    value_builder: ValueBuilder,
241    value_offsets: Vec<usize>,
242    nulls: NullBufferBuilder,
243}
244
245impl VariantValueArrayBuilder {
246    /// Create a new `VariantValueArrayBuilder` with the specified row capacity
247    pub fn new(row_capacity: usize) -> Self {
248        Self {
249            value_builder: ValueBuilder::new(),
250            value_offsets: Vec::with_capacity(row_capacity),
251            nulls: NullBufferBuilder::new(row_capacity),
252        }
253    }
254
255    /// Build the final value array
256    ///
257    /// Returns a [`BinaryViewArray`] containing the serialized variant values.
258    /// This can be combined with existing metadata to create a complete [`VariantArray`].
259    pub fn build(mut self) -> Result<BinaryViewArray, ArrowError> {
260        let value_buffer = self.value_builder.into_inner();
261        let mut array = binary_view_array_from_buffers(value_buffer, self.value_offsets);
262        if let Some(nulls) = self.nulls.finish() {
263            let (views, buffers, _) = array.into_parts();
264            array = BinaryViewArray::try_new(views, buffers, Some(nulls))?;
265        }
266        Ok(array)
267    }
268
269    /// Append a null row to the builder
270    ///
271    /// WARNING: It is only valid to call this method when building the `value` field of a shredded
272    /// variant column (which is nullable). The `value` field of a binary (unshredded) variant
273    /// column is non-nullable, and callers should instead invoke [`Self::append_value`] with
274    /// `Variant::Null`, passing the appropriate metadata value.
275    pub fn append_null(&mut self) {
276        self.value_offsets.push(self.value_builder.offset());
277        self.nulls.append_null();
278    }
279
280    /// Append a variant value with its corresponding metadata
281    ///
282    /// # Arguments
283    /// * `value` - The variant value to append
284    /// * `metadata` - The metadata dictionary for this variant (used for field name resolution)
285    ///
286    /// # Returns
287    /// * `Ok(())` if the value was successfully appended
288    /// * `Err(ArrowError)` if the variant contains field names not found in the metadata
289    ///
290    /// # Example
291    /// ```
292    /// # use parquet_variant::Variant;
293    /// # use parquet_variant_compute::VariantValueArrayBuilder;
294    /// let mut builder = VariantValueArrayBuilder::new(10);
295    /// builder.append_value(Variant::from(42));
296    /// ```
297    pub fn append_value(&mut self, value: Variant<'_, '_>) {
298        // NOTE: Have to clone because the builder consumes `value`
299        self.builder_ext(&value.metadata().clone())
300            .append_value(value);
301    }
302
303    /// Creates a builder-specific parent state.
304    ///
305    /// For example, this can be useful for code that wants to copy a subset of fields from an
306    /// object `value` as a new row of `value_array_builder`:
307    ///
308    /// ```no_run
309    /// # use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant};
310    /// # use parquet_variant_compute::VariantValueArrayBuilder;
311    /// # let value = Variant::Null;
312    /// # let mut value_array_builder = VariantValueArrayBuilder::new(0);
313    /// # fn should_keep(field_name: &str) -> bool { todo!() };
314    /// let Variant::Object(obj) = value else {
315    ///     panic!("Not a variant object");
316    /// };
317    /// let mut metadata_builder = ReadOnlyMetadataBuilder::new(&obj.metadata);
318    /// let state = value_array_builder.parent_state(&mut metadata_builder);
319    /// let mut object_builder = ObjectBuilder::new(state, false);
320    /// for (field_name, field_value) in obj.iter() {
321    ///     if should_keep(field_name) {
322    ///         object_builder.insert_bytes(field_name, field_value);
323    ///     }
324    /// }
325    ///  object_builder.finish(); // appends the filtered object
326    /// ```
327    pub fn parent_state<'a>(
328        &'a mut self,
329        metadata_builder: &'a mut dyn MetadataBuilder,
330    ) -> ParentState<'a, ValueArrayBuilderState<'a>> {
331        let state = ValueArrayBuilderState {
332            value_offsets: &mut self.value_offsets,
333            nulls: &mut self.nulls,
334        };
335
336        ParentState::new(&mut self.value_builder, metadata_builder, state)
337    }
338
339    /// Creates a thin [`VariantBuilderExt`] wrapper for this builder, which hides the `metadata`
340    /// parameter (similar to the way [`parquet_variant::ObjectFieldBuilder`] hides field names).
341    pub fn builder_ext<'a>(
342        &'a mut self,
343        metadata: &'a VariantMetadata<'a>,
344    ) -> VariantValueArrayBuilderExt<'a> {
345        VariantValueArrayBuilderExt {
346            metadata_builder: ReadOnlyMetadataBuilder::new(metadata),
347            value_builder: self,
348        }
349    }
350}
351
352/// Builder-specific state for array building that manages array-level offsets and nulls. See
353/// [`VariantBuilderExt`] for details.
354#[derive(Debug)]
355pub struct ValueArrayBuilderState<'a> {
356    value_offsets: &'a mut Vec<usize>,
357    nulls: &'a mut NullBufferBuilder,
358}
359
360// All changes are pending until finalized
361impl BuilderSpecificState for ValueArrayBuilderState<'_> {
362    fn finish(
363        &mut self,
364        _metadata_builder: &mut dyn MetadataBuilder,
365        value_builder: &mut ValueBuilder,
366    ) {
367        self.value_offsets.push(value_builder.offset());
368        self.nulls.append_non_null();
369    }
370}
371
372/// A thin [`VariantBuilderExt`] wrapper that hides the short-lived (per-row)
373/// [`ReadOnlyMetadataBuilder`] instances that [`VariantValueArrayBuilder`] requires.
374pub struct VariantValueArrayBuilderExt<'a> {
375    metadata_builder: ReadOnlyMetadataBuilder<'a>,
376    value_builder: &'a mut VariantValueArrayBuilder,
377}
378
379impl<'a> VariantValueArrayBuilderExt<'a> {
380    /// Creates a new instance from a metadata builder and a reference to a variant value builder.
381    pub fn new(
382        metadata_builder: ReadOnlyMetadataBuilder<'a>,
383        value_builder: &'a mut VariantValueArrayBuilder,
384    ) -> Self {
385        Self {
386            metadata_builder,
387            value_builder,
388        }
389    }
390}
391
392impl<'a> VariantBuilderExt for VariantValueArrayBuilderExt<'a> {
393    type State<'b>
394        = ValueArrayBuilderState<'b>
395    where
396        Self: 'b;
397
398    fn append_null(&mut self) {
399        self.value_builder.append_null()
400    }
401
402    fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
403        let state = self.value_builder.parent_state(&mut self.metadata_builder);
404        ValueBuilder::append_variant_bytes(state, value.into());
405    }
406
407    fn try_new_list(&mut self) -> Result<ListBuilder<'_, Self::State<'_>>, ArrowError> {
408        let state = self.value_builder.parent_state(&mut self.metadata_builder);
409        Ok(ListBuilder::new(state, false))
410    }
411
412    fn try_new_object(&mut self) -> Result<ObjectBuilder<'_, Self::State<'_>>, ArrowError> {
413        let state = self.value_builder.parent_state(&mut self.metadata_builder);
414        Ok(ObjectBuilder::new(state, false))
415    }
416}
417
418fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> BinaryViewArray {
419    // All offsets are less than or equal to the buffer length, so we can safely cast all offsets
420    // inside the loop below, as long as the buffer length fits in u32.
421    u32::try_from(buffer.len()).expect("buffer length should fit in u32");
422
423    let mut builder = BinaryViewBuilder::with_capacity(offsets.len());
424    let block = builder.append_block(buffer.into());
425    // TODO this can be much faster if it creates the views directly during append
426    let mut start = 0;
427    for end in offsets {
428        let end = end as u32; // Safe cast: validated max offset fits in u32 above
429        builder
430            .try_append_view(block, start, end - start)
431            .expect("Failed to append view");
432        start = end;
433    }
434    builder.finish()
435}
436
#[cfg(test)]
mod test {
    use super::*;
    use arrow::array::Array;
    use parquet_variant::Variant;

    /// The metadata and value columns must stay non-nullable even when the array holds a
    /// null row.
    #[test]
    fn test_variant_array_builder_non_nullable() {
        let mut array_builder = VariantArrayBuilder::new(10);
        array_builder.append_null(); // should not panic
        array_builder.append_variant(Variant::from(42i32));
        let built = array_builder.build();

        assert_eq!(built.len(), 2);
        assert!(built.is_null(0));
        assert!(!built.is_null(1));
        assert_eq!(built.value(1), Variant::from(42i32));

        // the metadata and value fields of non shredded variants should not be null
        assert!(built.metadata_field().nulls().is_none());
        assert!(built.value_field().unwrap().nulls().is_none());
        match built.data_type() {
            DataType::Struct(fields) => {
                for field in fields {
                    assert!(
                        !field.is_nullable(),
                        "Field {} should be non-nullable",
                        field.name()
                    );
                }
            }
            _ => panic!("Expected VariantArray to have Struct data type"),
        }
    }

    /// Appends a null, a primitive, an object and a list, then reads each row back.
    #[test]
    fn test_variant_array_builder() {
        let mut array_builder = VariantArrayBuilder::new(10);
        array_builder.append_null(); // should not panic
        array_builder.append_variant(Variant::from(42i32));

        // row 2: an object
        array_builder.new_object().with_field("foo", "bar").finish();

        // row 3: a list
        array_builder
            .new_list()
            .with_value(Variant::from(1i32))
            .with_value(Variant::from(2i32))
            .finish();
        let built = array_builder.build();

        assert_eq!(built.len(), 4);

        assert!(built.is_null(0));

        assert!(!built.is_null(1));
        assert_eq!(built.value(1), Variant::from(42i32));

        assert!(!built.is_null(2));
        let object_row = built.value(2);
        let object = object_row.as_object().expect("variant to be an object");
        assert_eq!(object.get("foo").unwrap(), Variant::from("bar"));

        assert!(!built.is_null(3));
        let list_row = built.value(3);
        let list = list_row.as_list().expect("variant to be a list");
        assert_eq!(list.len(), 2);
    }

    /// Values and nulls appended to the value-only builder each produce one row.
    #[test]
    fn test_variant_value_array_builder_basic() {
        let mut value_builder = VariantValueArrayBuilder::new(10);

        // Append some values
        value_builder.append_value(Variant::from(42i32));
        value_builder.append_null();
        value_builder.append_value(Variant::from("hello"));

        let values = value_builder.build().unwrap();
        assert_eq!(values.len(), 3);
    }

    /// Rebuilds only the value column of an existing array while reusing its metadata column.
    #[test]
    fn test_variant_value_array_builder_with_objects() {
        // Populate a variant array with objects
        let mut source_builder = VariantArrayBuilder::new(3);
        source_builder
            .new_object()
            .with_field("name", "Alice")
            .with_field("age", 30i32)
            .finish();

        source_builder
            .new_object()
            .with_field("name", "Bob")
            .with_field("age", 42i32)
            .with_field("city", "Wonderland")
            .finish();

        source_builder
            .new_object()
            .with_field("name", "Charlie")
            .with_field("age", 1i32)
            .finish();

        let source = source_builder.build();

        // Copy (some of) the objects over to the value array builder
        //
        // NOTE: Because we will reuse the metadata column, we cannot reorder rows. We can only
        // filter or manipulate values within a row.
        let mut value_builder = VariantValueArrayBuilder::new(3);

        // row 0: straight copy
        value_builder.append_value(source.value(0));

        // row 1: filtering fields takes more work because we need to manually create an
        // object builder
        let bob = source.value(1);
        let mut filtering_ext = value_builder.builder_ext(bob.metadata());
        filtering_ext
            .new_object()
            .with_field("name", bob.get_object_field("name").unwrap())
            .with_field("age", bob.get_object_field("age").unwrap())
            .finish();

        // row 2: same bytes, but now nested and duplicated inside a list
        let charlie = source.value(2);
        let mut nesting_ext = value_builder.builder_ext(charlie.metadata());
        nesting_ext
            .new_list()
            .with_value(charlie.clone())
            .with_value(charlie.clone())
            .finish();

        let rebuilt = VariantArray::from_parts(
            source.metadata_field().clone(),
            Some(value_builder.build().unwrap()),
            None,
            None,
        );

        assert_eq!(rebuilt.len(), 3);
        assert_eq!(source.value(0), rebuilt.value(0));

        assert_eq!(
            source.value(1).get_object_field("name"),
            rebuilt.value(1).get_object_field("name")
        );
        assert_eq!(
            source.value(1).get_object_field("age"),
            rebuilt.value(1).get_object_field("age")
        );

        assert_eq!(
            source.value(2),
            rebuilt.value(2).get_list_element(0).unwrap()
        );
        assert_eq!(
            source.value(2),
            rebuilt.value(2).get_list_element(1).unwrap()
        );
    }
}