parquet_variant_compute/
variant_array_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`VariantArrayBuilder`] implementation
19
20use crate::VariantArray;
21use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
22use arrow_schema::{DataType, Field, Fields};
23use parquet_variant::{Variant, VariantBuilder};
24use std::sync::Arc;
25
26/// A builder for [`VariantArray`]
27///
28/// This builder is used to construct a `VariantArray` and allows APIs for
29/// adding metadata
30///
31/// This builder always creates a `VariantArray` using [`BinaryViewArray`] for both
32/// the metadata and value fields.
33///
34/// # TODO
35/// 1. Support shredding: <https://github.com/apache/arrow-rs/issues/7895>
36///
37/// ## Example:
38/// ```
39/// # use arrow::array::Array;
40/// # use parquet_variant::{Variant, VariantBuilder};
41/// # use parquet_variant_compute::VariantArrayBuilder;
42/// // Create a new VariantArrayBuilder with a capacity of 100 rows
43/// let mut builder = VariantArrayBuilder::new(100);
44/// // append variant values
45/// builder.append_variant(Variant::from(42));
46/// // append a null row
47/// builder.append_null();
48/// // append a pre-constructed metadata and value buffers
49/// let (metadata, value) = {
50///   let mut vb = VariantBuilder::new();
51///   let mut obj = vb.new_object();
52///   obj.insert("foo", "bar");
53///   obj.finish().unwrap();
54///   vb.finish()
55/// };
56/// builder.append_variant_buffers(&metadata, &value);
57///
58/// // create the final VariantArray
59/// let variant_array = builder.build();
60/// assert_eq!(variant_array.len(), 3);
61/// // // Access the values
62/// // row 1 is not null and is an integer
63/// assert!(!variant_array.is_null(0));
64/// assert_eq!(variant_array.value(0), Variant::from(42i32));
65/// // row 1 is null
66/// assert!(variant_array.is_null(1));
67/// // row 2 is not null and is an object
68/// assert!(!variant_array.is_null(2));
69/// assert!(variant_array.value(2).as_object().is_some());
70/// ```
71#[derive(Debug)]
72pub struct VariantArrayBuilder {
73    /// Nulls
74    nulls: NullBufferBuilder,
75    /// buffer for all the metadata
76    metadata_buffer: Vec<u8>,
77    /// (offset, len) pairs for locations of metadata in the buffer
78    metadata_locations: Vec<(usize, usize)>,
79    /// buffer for values
80    value_buffer: Vec<u8>,
81    /// (offset, len) pairs for locations of values in the buffer
82    value_locations: Vec<(usize, usize)>,
83    /// The fields of the final `StructArray`
84    ///
85    /// TODO: 1) Add extension type metadata
86    /// TODO: 2) Add support for shredding
87    fields: Fields,
88}
89
90impl VariantArrayBuilder {
91    pub fn new(row_capacity: usize) -> Self {
92        // The subfields are expected to be non-nullable according to the parquet variant spec.
93        let metadata_field = Field::new("metadata", DataType::BinaryView, false);
94        let value_field = Field::new("value", DataType::BinaryView, false);
95
96        Self {
97            nulls: NullBufferBuilder::new(row_capacity),
98            metadata_buffer: Vec::new(), // todo allocation capacity
99            metadata_locations: Vec::with_capacity(row_capacity),
100            value_buffer: Vec::new(),
101            value_locations: Vec::with_capacity(row_capacity),
102            fields: Fields::from(vec![metadata_field, value_field]),
103        }
104    }
105
106    /// Build the final builder
107    pub fn build(self) -> VariantArray {
108        let Self {
109            mut nulls,
110            metadata_buffer,
111            metadata_locations,
112            value_buffer,
113            value_locations,
114            fields,
115        } = self;
116
117        let metadata_array = binary_view_array_from_buffers(metadata_buffer, metadata_locations);
118
119        let value_array = binary_view_array_from_buffers(value_buffer, value_locations);
120
121        // The build the final struct array
122        let inner = StructArray::new(
123            fields,
124            vec![
125                Arc::new(metadata_array) as ArrayRef,
126                Arc::new(value_array) as ArrayRef,
127            ],
128            nulls.finish(),
129        );
130        // TODO add arrow extension type metadata
131
132        VariantArray::try_new(Arc::new(inner)).expect("valid VariantArray by construction")
133    }
134
135    /// Appends a null row to the builder.
136    pub fn append_null(&mut self) {
137        self.nulls.append_null();
138        // The subfields are expected to be non-nullable according to the parquet variant spec.
139        let metadata_offset = self.metadata_buffer.len();
140        let metadata_length = 0;
141        self.metadata_locations
142            .push((metadata_offset, metadata_length));
143        let value_offset = self.value_buffer.len();
144        let value_length = 0;
145        self.value_locations.push((value_offset, value_length));
146    }
147
148    /// Append the [`Variant`] to the builder as the next row
149    pub fn append_variant(&mut self, variant: Variant) {
150        // TODO make this more efficient by avoiding the intermediate buffers
151        let mut variant_builder = VariantBuilder::new();
152        variant_builder.append_value(variant);
153        let (metadata, value) = variant_builder.finish();
154        self.append_variant_buffers(&metadata, &value);
155    }
156
157    /// Append a metadata and values buffer to the builder
158    pub fn append_variant_buffers(&mut self, metadata: &[u8], value: &[u8]) {
159        self.nulls.append_non_null();
160        let metadata_length = metadata.len();
161        let metadata_offset = self.metadata_buffer.len();
162        self.metadata_locations
163            .push((metadata_offset, metadata_length));
164        self.metadata_buffer.extend_from_slice(metadata);
165        let value_length = value.len();
166        let value_offset = self.value_buffer.len();
167        self.value_locations.push((value_offset, value_length));
168        self.value_buffer.extend_from_slice(value);
169    }
170
171    // TODO: Return a Variant builder that will write to the underlying buffers (TODO)
172}
173
174fn binary_view_array_from_buffers(
175    buffer: Vec<u8>,
176    locations: Vec<(usize, usize)>,
177) -> BinaryViewArray {
178    let mut builder = BinaryViewBuilder::with_capacity(locations.len());
179    let block = builder.append_block(buffer.into());
180    // TODO this can be much faster if it creates the views directly during append
181    for (offset, length) in locations {
182        let offset = offset.try_into().expect("offset should fit in u32");
183        let length = length.try_into().expect("length should fit in u32");
184        builder
185            .try_append_view(block, offset, length)
186            .expect("Failed to append view");
187    }
188    builder.finish()
189}
190
#[cfg(test)]
mod test {
    use super::*;
    use arrow::array::Array;

    /// A null row followed by a value row must build successfully, and neither
    /// the metadata nor the value child may carry nulls or be declared nullable.
    #[test]
    fn test_variant_array_builder_non_nullable() {
        let variant_array = {
            let mut builder = VariantArrayBuilder::new(10);
            builder.append_null(); // should not panic
            builder.append_variant(Variant::from(42i32));
            builder.build()
        };

        // Row-level expectations: row 0 is null, row 1 holds the integer
        assert_eq!(variant_array.len(), 2);
        assert!(variant_array.is_null(0));
        assert!(!variant_array.is_null(1));
        assert_eq!(variant_array.value(1), Variant::from(42i32));

        // the metadata and value fields of non shredded variants should not be null
        assert!(variant_array.metadata_field().nulls().is_none());
        assert!(variant_array.value_field().nulls().is_none());

        match variant_array.data_type() {
            DataType::Struct(fields) => {
                for field in fields {
                    assert!(
                        !field.is_nullable(),
                        "Field {} should be non-nullable",
                        field.name()
                    );
                }
            }
            _ => panic!("Expected VariantArray to have Struct data type"),
        }
    }
}
223}