parquet_variant_compute/
variant_array_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArrayBuilder`] implementation
19
20use crate::VariantArray;
21use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
22use arrow_schema::{ArrowError, DataType, Field, Fields};
23use parquet_variant::{
24    BuilderSpecificState, ListBuilder, MetadataBuilder, ObjectBuilder, Variant, VariantBuilderExt,
25    VariantMetadata,
26};
27use parquet_variant::{
28    ParentState, ReadOnlyMetadataBuilder, ValueBuilder, WritableMetadataBuilder,
29};
30use std::sync::Arc;
31
/// A builder for [`VariantArray`]
///
/// This builder is used to construct a `VariantArray` one row at a time. Rows can be
/// appended as complete [`Variant`] values, as nulls, or as nested objects and lists
/// through the [`VariantBuilderExt`] methods.
///
/// This builder always creates a `VariantArray` using [`BinaryViewArray`] for both
/// the metadata and value fields.
///
/// # TODO
/// 1. Support shredding: <https://github.com/apache/arrow-rs/issues/7895>
///
/// ## Example:
/// ```
/// # use arrow::array::Array;
/// # use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt};
/// # use parquet_variant_compute::VariantArrayBuilder;
/// # use parquet_variant::ShortString;
/// // Create a new VariantArrayBuilder with a capacity of 100 rows
/// let mut builder = VariantArrayBuilder::new(100);
/// // append variant values
/// builder.append_variant(Variant::from(42));
/// // append a null row (note: this is a null row, not a `Variant::Null` value)
/// builder.append_null();
/// // append an object to the builder using VariantBuilderExt methods directly
/// builder.new_object()
///   .with_field("foo", "bar")
///   .finish();
///
/// // bulk insert a list of values
/// // `Option::None` is a null value
/// builder.extend([None, Some(Variant::from("norm"))]);
///
/// // create the final VariantArray
/// let variant_array = builder.build();
/// assert_eq!(variant_array.len(), 5);
/// // Access the values
/// // row 0 is not null and is an integer
/// assert!(!variant_array.is_null(0));
/// assert_eq!(variant_array.value(0), Variant::from(42i32));
/// // row 1 is null
/// assert!(variant_array.is_null(1));
/// // row 2 is not null and is an object
/// assert!(!variant_array.is_null(2));
/// let value = variant_array.value(2);
/// let obj = value.as_object().expect("expected object");
/// assert_eq!(obj.get("foo"), Some(Variant::from("bar")));
/// // row 3 is null
/// assert!(variant_array.is_null(3));
/// // row 4 is not null and is a short string
/// assert!(!variant_array.is_null(4));
/// let value = variant_array.value(4);
/// assert_eq!(value, Variant::ShortString(ShortString::try_new("norm").unwrap()));
/// ```
#[derive(Debug)]
pub struct VariantArrayBuilder {
    /// Row-level validity: one entry per appended row
    nulls: NullBufferBuilder,
    /// Builder that accumulates the serialized metadata for all rows in one buffer
    metadata_builder: WritableMetadataBuilder,
    /// Ending offset of each row's serialized metadata dictionary in the metadata buffer
    metadata_offsets: Vec<usize>,
    /// Builder that accumulates the serialized values for all rows in one buffer
    value_builder: ValueBuilder,
    /// Ending offset of each row's serialized variant value in the value buffer
    value_offsets: Vec<usize>,
    /// The fields of the final `StructArray`
    ///
    /// TODO: 1) Add extension type metadata
    /// TODO: 2) Add support for shredding
    fields: Fields,
}
103
104impl VariantArrayBuilder {
105    pub fn new(row_capacity: usize) -> Self {
106        // The subfields are expected to be non-nullable according to the parquet variant spec.
107        let metadata_field = Field::new("metadata", DataType::BinaryView, false);
108        let value_field = Field::new("value", DataType::BinaryView, false);
109
110        Self {
111            nulls: NullBufferBuilder::new(row_capacity),
112            metadata_builder: WritableMetadataBuilder::default(),
113            metadata_offsets: Vec::with_capacity(row_capacity),
114            value_builder: ValueBuilder::new(),
115            value_offsets: Vec::with_capacity(row_capacity),
116            fields: Fields::from(vec![metadata_field, value_field]),
117        }
118    }
119
120    /// Build the final builder
121    pub fn build(self) -> VariantArray {
122        let Self {
123            mut nulls,
124            metadata_builder,
125            metadata_offsets,
126            value_builder,
127            value_offsets,
128            fields,
129        } = self;
130
131        let metadata_buffer = metadata_builder.into_inner();
132        let metadata_array = binary_view_array_from_buffers(metadata_buffer, metadata_offsets);
133
134        let value_buffer = value_builder.into_inner();
135        let value_array = binary_view_array_from_buffers(value_buffer, value_offsets);
136
137        // The build the final struct array
138        let inner = StructArray::new(
139            fields,
140            vec![
141                Arc::new(metadata_array) as ArrayRef,
142                Arc::new(value_array) as ArrayRef,
143            ],
144            nulls.finish(),
145        );
146        // TODO add arrow extension type metadata
147
148        VariantArray::try_new(&inner).expect("valid VariantArray by construction")
149    }
150
151    /// Appends a null row to the builder.
152    pub fn append_null(&mut self) {
153        self.nulls.append_null();
154        // The subfields are expected to be non-nullable according to the parquet variant spec.
155        self.metadata_offsets.push(self.metadata_builder.offset());
156        self.value_offsets.push(self.value_builder.offset());
157    }
158
159    /// Append the [`Variant`] to the builder as the next row
160    pub fn append_variant(&mut self, variant: Variant) {
161        ValueBuilder::append_variant(self.parent_state(), variant);
162    }
163
164    /// Creates a builder-specific parent state
165    fn parent_state(&mut self) -> ParentState<'_, ArrayBuilderState<'_>> {
166        let state = ArrayBuilderState {
167            metadata_offsets: &mut self.metadata_offsets,
168            value_offsets: &mut self.value_offsets,
169            nulls: &mut self.nulls,
170        };
171
172        ParentState::new(&mut self.value_builder, &mut self.metadata_builder, state)
173    }
174}
175
176impl<'m, 'v> Extend<Option<Variant<'m, 'v>>> for VariantArrayBuilder {
177    fn extend<T: IntoIterator<Item = Option<Variant<'m, 'v>>>>(&mut self, iter: T) {
178        for v in iter {
179            match v {
180                Some(v) => self.append_variant(v),
181                None => self.append_null(),
182            }
183        }
184    }
185}
186
/// Builder-specific state for array building that manages array-level offsets and nulls. See
/// [`VariantBuilderExt`] for details.
///
/// Holds mutable borrows of a [`VariantArrayBuilder`]'s per-row bookkeeping so that a row can
/// be recorded when the state is finished.
#[derive(Debug)]
pub struct ArrayBuilderState<'a> {
    /// Receives the ending metadata offset of each finished row
    metadata_offsets: &'a mut Vec<usize>,
    /// Receives the ending value offset of each finished row
    value_offsets: &'a mut Vec<usize>,
    /// Receives the validity bit of each finished row
    nulls: &'a mut NullBufferBuilder,
}
195
196// All changes are pending until finalized
197impl BuilderSpecificState for ArrayBuilderState<'_> {
198    fn finish(
199        &mut self,
200        metadata_builder: &mut dyn MetadataBuilder,
201        value_builder: &mut ValueBuilder,
202    ) {
203        self.metadata_offsets.push(metadata_builder.finish());
204        self.value_offsets.push(value_builder.offset());
205        self.nulls.append_non_null();
206    }
207}
208
209impl VariantBuilderExt for VariantArrayBuilder {
210    type State<'a>
211        = ArrayBuilderState<'a>
212    where
213        Self: 'a;
214
215    /// Appending NULL to a variant array produces an actual NULL value
216    fn append_null(&mut self) {
217        self.append_null();
218    }
219
220    fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
221        self.append_variant(value.into());
222    }
223
224    fn try_new_list(&mut self) -> Result<ListBuilder<'_, Self::State<'_>>, ArrowError> {
225        Ok(ListBuilder::new(self.parent_state(), false))
226    }
227
228    fn try_new_object(&mut self) -> Result<ObjectBuilder<'_, Self::State<'_>>, ArrowError> {
229        Ok(ObjectBuilder::new(self.parent_state(), false))
230    }
231}
232
/// A builder for creating only the value column of a [`VariantArray`]
///
/// This builder is used when you have existing metadata and only need to build
/// the value column. It's useful for scenarios like variant unshredding, data
/// transformation, or filtering where you want to reuse existing metadata.
///
/// The builder produces a [`BinaryViewArray`] that can be combined with existing
/// metadata to create a complete [`VariantArray`].
///
/// # Example:
/// ```
/// # use arrow::array::Array;
/// # use parquet_variant::{Variant};
/// # use parquet_variant_compute::VariantValueArrayBuilder;
/// // Create a variant value builder for 10 rows
/// let mut builder = VariantValueArrayBuilder::new(10);
///
/// // Append some values. Each variant carries its own metadata, which the
/// // builder reuses instead of creating new metadata.
/// builder.append_value(Variant::from(42));
/// builder.append_null();
/// builder.append_value(Variant::from("hello"));
///
/// // Build the final value array
/// let value_array = builder.build().unwrap();
/// assert_eq!(value_array.len(), 3);
/// ```
#[derive(Debug)]
pub struct VariantValueArrayBuilder {
    /// Builder that accumulates the serialized values for all rows in one buffer
    value_builder: ValueBuilder,
    /// Ending offset of each row's serialized value in the value buffer
    value_offsets: Vec<usize>,
    /// Row-level validity: one entry per appended row
    nulls: NullBufferBuilder,
}
266
267impl VariantValueArrayBuilder {
268    /// Create a new `VariantValueArrayBuilder` with the specified row capacity
269    pub fn new(row_capacity: usize) -> Self {
270        Self {
271            value_builder: ValueBuilder::new(),
272            value_offsets: Vec::with_capacity(row_capacity),
273            nulls: NullBufferBuilder::new(row_capacity),
274        }
275    }
276
277    /// Build the final value array
278    ///
279    /// Returns a [`BinaryViewArray`] containing the serialized variant values.
280    /// This can be combined with existing metadata to create a complete [`VariantArray`].
281    pub fn build(mut self) -> Result<BinaryViewArray, ArrowError> {
282        let value_buffer = self.value_builder.into_inner();
283        let mut array = binary_view_array_from_buffers(value_buffer, self.value_offsets);
284        if let Some(nulls) = self.nulls.finish() {
285            let (views, buffers, _) = array.into_parts();
286            array = BinaryViewArray::try_new(views, buffers, Some(nulls))?;
287        }
288        Ok(array)
289    }
290
291    /// Append a null row to the builder
292    ///
293    /// WARNING: It is only valid to call this method when building the `value` field of a shredded
294    /// variant column (which is nullable). The `value` field of a binary (unshredded) variant
295    /// column is non-nullable, and callers should instead invoke [`Self::append_value`] with
296    /// `Variant::Null`, passing the appropriate metadata value.
297    pub fn append_null(&mut self) {
298        self.value_offsets.push(self.value_builder.offset());
299        self.nulls.append_null();
300    }
301
302    /// Append a variant value with its corresponding metadata
303    ///
304    /// # Arguments
305    /// * `value` - The variant value to append
306    /// * `metadata` - The metadata dictionary for this variant (used for field name resolution)
307    ///
308    /// # Returns
309    /// * `Ok(())` if the value was successfully appended
310    /// * `Err(ArrowError)` if the variant contains field names not found in the metadata
311    ///
312    /// # Example
313    /// ```
314    /// # use parquet_variant::Variant;
315    /// # use parquet_variant_compute::VariantValueArrayBuilder;
316    /// let mut builder = VariantValueArrayBuilder::new(10);
317    /// builder.append_value(Variant::from(42));
318    /// ```
319    pub fn append_value(&mut self, value: Variant<'_, '_>) {
320        // NOTE: Have to clone because the builder consumes `value`
321        self.builder_ext(&value.metadata().clone())
322            .append_value(value);
323    }
324
325    /// Creates a builder-specific parent state.
326    ///
327    /// For example, this can be useful for code that wants to copy a subset of fields from an
328    /// object `value` as a new row of `value_array_builder`:
329    ///
330    /// ```no_run
331    /// # use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant};
332    /// # use parquet_variant_compute::VariantValueArrayBuilder;
333    /// # let value = Variant::Null;
334    /// # let mut value_array_builder = VariantValueArrayBuilder::new(0);
335    /// # fn should_keep(field_name: &str) -> bool { todo!() };
336    /// let Variant::Object(obj) = value else {
337    ///     panic!("Not a variant object");
338    /// };
339    /// let mut metadata_builder = ReadOnlyMetadataBuilder::new(&obj.metadata);
340    /// let state = value_array_builder.parent_state(&mut metadata_builder);
341    /// let mut object_builder = ObjectBuilder::new(state, false);
342    /// for (field_name, field_value) in obj.iter() {
343    ///     if should_keep(field_name) {
344    ///         object_builder.insert_bytes(field_name, field_value);
345    ///     }
346    /// }
347    ///  object_builder.finish(); // appends the filtered object
348    /// ```
349    pub fn parent_state<'a>(
350        &'a mut self,
351        metadata_builder: &'a mut dyn MetadataBuilder,
352    ) -> ParentState<'a, ValueArrayBuilderState<'a>> {
353        let state = ValueArrayBuilderState {
354            value_offsets: &mut self.value_offsets,
355            nulls: &mut self.nulls,
356        };
357
358        ParentState::new(&mut self.value_builder, metadata_builder, state)
359    }
360
361    /// Creates a thin [`VariantBuilderExt`] wrapper for this builder, which hides the `metadata`
362    /// parameter (similar to the way [`parquet_variant::ObjectFieldBuilder`] hides field names).
363    pub fn builder_ext<'a>(
364        &'a mut self,
365        metadata: &'a VariantMetadata<'a>,
366    ) -> VariantValueArrayBuilderExt<'a> {
367        VariantValueArrayBuilderExt {
368            metadata_builder: ReadOnlyMetadataBuilder::new(metadata),
369            value_builder: self,
370        }
371    }
372}
373
/// Builder-specific state for array building that manages array-level offsets and nulls. See
/// [`VariantBuilderExt`] for details.
///
/// Holds mutable borrows of a [`VariantValueArrayBuilder`]'s per-row bookkeeping so that a row
/// can be recorded when the state is finished.
#[derive(Debug)]
pub struct ValueArrayBuilderState<'a> {
    /// Receives the ending value offset of each finished row
    value_offsets: &'a mut Vec<usize>,
    /// Receives the validity bit of each finished row
    nulls: &'a mut NullBufferBuilder,
}
381
382// All changes are pending until finalized
383impl BuilderSpecificState for ValueArrayBuilderState<'_> {
384    fn finish(
385        &mut self,
386        _metadata_builder: &mut dyn MetadataBuilder,
387        value_builder: &mut ValueBuilder,
388    ) {
389        self.value_offsets.push(value_builder.offset());
390        self.nulls.append_non_null();
391    }
392}
393
/// A thin [`VariantBuilderExt`] wrapper that hides the short-lived (per-row)
/// [`ReadOnlyMetadataBuilder`] instances that [`VariantValueArrayBuilder`] requires.
pub struct VariantValueArrayBuilderExt<'a> {
    /// Read-only metadata builder passed along whenever a parent state is created
    metadata_builder: ReadOnlyMetadataBuilder<'a>,
    /// The value array builder that receives the appended rows
    value_builder: &'a mut VariantValueArrayBuilder,
}
400
401impl<'a> VariantValueArrayBuilderExt<'a> {
402    /// Creates a new instance from a metadata builder and a reference to a variant value builder.
403    pub fn new(
404        metadata_builder: ReadOnlyMetadataBuilder<'a>,
405        value_builder: &'a mut VariantValueArrayBuilder,
406    ) -> Self {
407        Self {
408            metadata_builder,
409            value_builder,
410        }
411    }
412}
413
414impl<'a> VariantBuilderExt for VariantValueArrayBuilderExt<'a> {
415    type State<'b>
416        = ValueArrayBuilderState<'b>
417    where
418        Self: 'b;
419
420    fn append_null(&mut self) {
421        self.value_builder.append_null()
422    }
423
424    fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
425        let state = self.value_builder.parent_state(&mut self.metadata_builder);
426        ValueBuilder::append_variant_bytes(state, value.into());
427    }
428
429    fn try_new_list(&mut self) -> Result<ListBuilder<'_, Self::State<'_>>, ArrowError> {
430        let state = self.value_builder.parent_state(&mut self.metadata_builder);
431        Ok(ListBuilder::new(state, false))
432    }
433
434    fn try_new_object(&mut self) -> Result<ObjectBuilder<'_, Self::State<'_>>, ArrowError> {
435        let state = self.value_builder.parent_state(&mut self.metadata_builder);
436        Ok(ObjectBuilder::new(state, false))
437    }
438}
439
440fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> BinaryViewArray {
441    // All offsets are less than or equal to the buffer length, so we can safely cast all offsets
442    // inside the loop below, as long as the buffer length fits in u32.
443    u32::try_from(buffer.len()).expect("buffer length should fit in u32");
444
445    let mut builder = BinaryViewBuilder::with_capacity(offsets.len());
446    let block = builder.append_block(buffer.into());
447    // TODO this can be much faster if it creates the views directly during append
448    let mut start = 0;
449    for end in offsets {
450        let end = end as u32; // Safe cast: validated max offset fits in u32 above
451        builder
452            .try_append_view(block, start, end - start)
453            .expect("Failed to append view");
454        start = end;
455    }
456    builder.finish()
457}
458
#[cfg(test)]
mod test {
    use super::*;
    use arrow::array::Array;
    use parquet_variant::{ShortString, Variant};

    /// Test that both the metadata and value fields are non-nullable, even when the array
    /// contains a null row
    #[test]
    fn test_variant_array_builder_non_nullable() {
        let mut builder = VariantArrayBuilder::new(10);

        builder.extend([
            None, // should not panic
            Some(Variant::from(42_i32)),
        ]);

        let variant_array = builder.build();

        assert_eq!(variant_array.len(), 2);
        assert!(variant_array.is_null(0));
        assert!(!variant_array.is_null(1));
        assert_eq!(variant_array.value(1), Variant::from(42i32));

        // the metadata and value fields of non shredded variants should not be null
        assert!(variant_array.metadata_field().nulls().is_none());
        assert!(variant_array.value_field().unwrap().nulls().is_none());
        let DataType::Struct(fields) = variant_array.data_type() else {
            panic!("Expected VariantArray to have Struct data type");
        };
        for field in fields {
            assert!(
                !field.is_nullable(),
                "Field {} should be non-nullable",
                field.name()
            );
        }
    }

    /// Test appending a null, a primitive, an object, and a list to the array builder
    #[test]
    fn test_variant_array_builder() {
        let mut builder = VariantArrayBuilder::new(10);
        builder.append_null(); // should not panic
        builder.append_variant(Variant::from(42i32));

        // make an object in the next row
        builder.new_object().with_field("foo", "bar").finish();

        // append a new list
        builder
            .new_list()
            .with_value(Variant::from(1i32))
            .with_value(Variant::from(2i32))
            .finish();
        let variant_array = builder.build();

        // one row per append: null, integer, object, list
        assert_eq!(variant_array.len(), 4);
        assert!(variant_array.is_null(0));
        assert!(!variant_array.is_null(1));
        assert_eq!(variant_array.value(1), Variant::from(42i32));
        assert!(!variant_array.is_null(2));
        let variant = variant_array.value(2);
        let variant = variant.as_object().expect("variant to be an object");
        assert_eq!(variant.get("foo").unwrap(), Variant::from("bar"));
        assert!(!variant_array.is_null(3));
        let variant = variant_array.value(3);
        let list = variant.as_list().expect("variant to be a list");
        assert_eq!(list.len(), 2);
    }

    /// Test that `extend` distinguishes a null row (`None`) from a `Variant::Null` value
    #[test]
    fn test_extend_variant_array_builder() {
        let mut b = VariantArrayBuilder::new(3);
        b.extend([None, Some(Variant::Null), Some(Variant::from("norm"))]);

        let variant_array = b.build();

        assert_eq!(variant_array.len(), 3);
        assert!(variant_array.is_null(0));
        assert_eq!(variant_array.value(1), Variant::Null);
        assert_eq!(
            variant_array.value(2),
            Variant::ShortString(ShortString::try_new("norm").unwrap())
        );
    }

    /// Test that the value array builder produces one row per appended value or null
    #[test]
    fn test_variant_value_array_builder_basic() {
        let mut builder = VariantValueArrayBuilder::new(10);

        // Append some values
        builder.append_value(Variant::from(42i32));
        builder.append_null();
        builder.append_value(Variant::from("hello"));

        let value_array = builder.build().unwrap();
        assert_eq!(value_array.len(), 3);
    }

    /// Test rebuilding only the value column of an existing array (straight copy, field
    /// filtering, and nesting inside a list) while reusing the original metadata column
    #[test]
    fn test_variant_value_array_builder_with_objects() {
        // Populate a variant array with objects
        let mut builder = VariantArrayBuilder::new(3);
        builder
            .new_object()
            .with_field("name", "Alice")
            .with_field("age", 30i32)
            .finish();

        builder
            .new_object()
            .with_field("name", "Bob")
            .with_field("age", 42i32)
            .with_field("city", "Wonderland")
            .finish();

        builder
            .new_object()
            .with_field("name", "Charlie")
            .with_field("age", 1i32)
            .finish();

        let array = builder.build();

        // Copy (some of) the objects over to the value array builder
        //
        // NOTE: Because we will reuse the metadata column, we cannot reorder rows. We can only
        // filter or manipulate values within a row.
        let mut value_builder = VariantValueArrayBuilder::new(3);

        // straight copy
        value_builder.append_value(array.value(0));

        // filtering fields takes more work because we need to manually create an object builder
        let value = array.value(1);
        let mut builder = value_builder.builder_ext(value.metadata());
        builder
            .new_object()
            .with_field("name", value.get_object_field("name").unwrap())
            .with_field("age", value.get_object_field("age").unwrap())
            .finish();

        // same bytes, but now nested and duplicated inside a list
        let value = array.value(2);
        let mut builder = value_builder.builder_ext(value.metadata());
        builder
            .new_list()
            .with_value(value.clone())
            .with_value(value.clone())
            .finish();

        // combine the original metadata column with the newly built value column
        let array2 = VariantArray::from_parts(
            array.metadata_field().clone(),
            Some(value_builder.build().unwrap()),
            None,
            None,
        );

        assert_eq!(array2.len(), 3);
        // row 0: unchanged copy
        assert_eq!(array.value(0), array2.value(0));

        // row 1: the fields that were kept still match the original
        assert_eq!(
            array.value(1).get_object_field("name"),
            array2.value(1).get_object_field("name")
        );
        assert_eq!(
            array.value(1).get_object_field("age"),
            array2.value(1).get_object_field("age")
        );

        // row 2: both list elements equal the original object
        assert_eq!(array.value(2), array2.value(2).get_list_element(0).unwrap());
        assert_eq!(array.value(2), array2.value(2).get_list_element(1).unwrap());
    }
}