parquet_variant/builder/
metadata.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::collections::HashMap;
19
20use arrow_schema::ArrowError;
21use indexmap::IndexSet;
22
23use crate::{VariantMetadata, int_size};
24
25/// Write little-endian integer to buffer
/// Appends the `nbytes` least-significant bytes of `value` to `buf`, lowest byte first
/// (little-endian).
///
/// # Panics
///
/// Panics if `nbytes` is larger than the byte width of `usize`.
fn write_offset(buf: &mut Vec<u8>, value: usize, nbytes: u8) {
    let width = usize::from(nbytes);
    let little_endian = value.to_le_bytes();
    // Only the low `width` bytes are kept; higher-order bytes are dropped.
    buf.extend_from_slice(&little_endian[..width]);
}
30
31/// A trait for building variant metadata dictionaries, to be used in conjunction with a
32/// [`ValueBuilder`]. The trait provides methods for managing field names and their IDs, as well as
33/// rolling back a failed builder operation that might have created new field ids.
34///
35/// [`ValueBuilder`]: crate::builder::ValueBuilder
36pub trait MetadataBuilder: std::fmt::Debug {
37    /// Attempts to register a field name, returning the corresponding (possibly newly-created)
38    /// field id on success. Attempting to register the same field name twice will _generally_
39    /// produce the same field id both times, but the variant spec does not actually require it.
40    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, ArrowError>;
41
42    /// Retrieves the field name for a given field id, which must be less than
43    /// [`Self::num_field_names`]. Panics if the field id is out of bounds.
44    fn field_name(&self, field_id: usize) -> &str;
45
46    /// Returns the number of field names stored in this metadata builder. Any number less than this
47    /// is a valid field id. The builder can be reverted back to this size later on (discarding any
48    /// newer/higher field ids) by calling [`Self::truncate_field_names`].
49    fn num_field_names(&self) -> usize;
50
51    /// Reverts the field names to a previous size, discarding any newly out of bounds field ids.
52    fn truncate_field_names(&mut self, new_size: usize);
53
54    /// Finishes the current metadata dictionary, returning the new size of the underlying buffer.
55    fn finish(&mut self) -> usize;
56}
57
58impl MetadataBuilder for WritableMetadataBuilder {
59    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, ArrowError> {
60        Ok(self.upsert_field_name(field_name))
61    }
62    fn field_name(&self, field_id: usize) -> &str {
63        self.field_name(field_id)
64    }
65    fn num_field_names(&self) -> usize {
66        self.num_field_names()
67    }
68    fn truncate_field_names(&mut self, new_size: usize) {
69        self.field_names.truncate(new_size)
70    }
71    fn finish(&mut self) -> usize {
72        self.finish()
73    }
74}
75
76/// A metadata builder that cannot register new field names, and merely returns the field id
77/// associated with a known field name. This is useful for variant unshredding operations, where the
78/// metadata column is fixed and -- per variant shredding spec -- already contains all field names
79/// from the typed_value column. It is also useful when projecting a subset of fields from a variant
80/// object value, since the bytes can be copied across directly without re-encoding their field ids.
81///
82/// NOTE: [`Self::finish`] is a no-op. If the intent is to make a copy of the underlying bytes each
83/// time `finish` is called, a different trait impl will be needed.
84#[derive(Debug)]
85pub struct ReadOnlyMetadataBuilder<'m> {
86    metadata: &'m VariantMetadata<'m>,
87    // A cache that tracks field names this builder has already seen, because finding the field id
88    // for a given field name is expensive -- O(n) for a large and unsorted metadata dictionary.
89    known_field_names: HashMap<&'m str, u32>,
90}
91
92impl<'m> ReadOnlyMetadataBuilder<'m> {
93    /// Creates a new read-only metadata builder from the given metadata dictionary.
94    pub fn new(metadata: &'m VariantMetadata<'m>) -> Self {
95        Self {
96            metadata,
97            known_field_names: HashMap::new(),
98        }
99    }
100}
101
102impl MetadataBuilder for ReadOnlyMetadataBuilder<'_> {
103    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, ArrowError> {
104        if let Some(field_id) = self.known_field_names.get(field_name) {
105            return Ok(*field_id);
106        }
107
108        let Some((field_id, field_name)) = self.metadata.get_entry(field_name) else {
109            return Err(ArrowError::InvalidArgumentError(format!(
110                "Field name '{field_name}' not found in metadata dictionary"
111            )));
112        };
113
114        self.known_field_names.insert(field_name, field_id);
115        Ok(field_id)
116    }
117    fn field_name(&self, field_id: usize) -> &str {
118        &self.metadata[field_id]
119    }
120    fn num_field_names(&self) -> usize {
121        self.metadata.len()
122    }
123    fn truncate_field_names(&mut self, new_size: usize) {
124        debug_assert_eq!(self.metadata.len(), new_size);
125    }
126    fn finish(&mut self) -> usize {
127        self.metadata.bytes.len()
128    }
129}
130
131/// Builder for constructing metadata for [`Variant`] values.
132///
133/// This is used internally by the [`VariantBuilder`] to construct the metadata
134///
135/// You can use an existing `Vec<u8>` as the metadata buffer by using the `from` impl.
136///
137/// [`Variant`]: crate::Variant
138/// [`VariantBuilder`]: crate::VariantBuilder
139#[derive(Default, Debug)]
140pub struct WritableMetadataBuilder {
141    pub(crate) field_names: IndexSet<String>,
142
143    pub(crate) is_sorted: bool,
144
145    /// Output buffer. Metadata is written to the end of this buffer
146    metadata_buffer: Vec<u8>,
147}
148
149impl WritableMetadataBuilder {
150    /// Upsert field name to dictionary, return its ID
151    pub fn upsert_field_name(&mut self, field_name: &str) -> u32 {
152        let (id, new_entry) = self.field_names.insert_full(field_name.to_string());
153
154        if new_entry {
155            let n = self.num_field_names();
156
157            // Dictionary sort order tracking:
158            // - An empty dictionary is unsorted (ambiguous in spec but required by interop tests)
159            // - A single-entry dictionary is trivially sorted
160            // - Otherwise, an already-sorted dictionary becomes unsorted if the new entry breaks order
161            self.is_sorted =
162                n == 1 || self.is_sorted && (self.field_names[n - 2] < self.field_names[n - 1]);
163        }
164
165        id as u32
166    }
167
168    /// The current length of the underlying metadata buffer
169    pub fn offset(&self) -> usize {
170        self.metadata_buffer.len()
171    }
172
173    /// Returns the number of field names stored in the metadata builder.
174    /// Note: this method should be the only place to call `self.field_names.len()`
175    ///
176    /// # Panics
177    ///
178    /// If the number of field names exceeds the maximum allowed value for `u32`.
179    fn num_field_names(&self) -> usize {
180        let n = self.field_names.len();
181        assert!(n <= u32::MAX as usize);
182
183        n
184    }
185
186    fn field_name(&self, i: usize) -> &str {
187        &self.field_names[i]
188    }
189
190    fn metadata_size(&self) -> usize {
191        self.field_names.iter().map(|k| k.len()).sum()
192    }
193
194    /// Finalizes the metadata dictionary and appends its serialized bytes to the underlying buffer,
195    /// returning the resulting [`Self::offset`]. The builder state is reset and ready to start
196    /// building a new metadata dictionary.
197    pub fn finish(&mut self) -> usize {
198        let nkeys = self.num_field_names();
199
200        // Calculate metadata size
201        let total_dict_size: usize = self.metadata_size();
202
203        let metadata_buffer = &mut self.metadata_buffer;
204        let is_sorted = std::mem::take(&mut self.is_sorted);
205        let field_names = std::mem::take(&mut self.field_names);
206
207        // Determine appropriate offset size based on the larger of dict size or total string size
208        let max_offset = std::cmp::max(total_dict_size, nkeys);
209        let offset_size = int_size(max_offset);
210
211        let offset_start = 1 + offset_size as usize;
212        let string_start = offset_start + (nkeys + 1) * offset_size as usize;
213        let metadata_size = string_start + total_dict_size;
214
215        metadata_buffer.reserve(metadata_size);
216
217        // Write header: version=1, field names are sorted, with calculated offset_size
218        metadata_buffer.push(0x01 | (is_sorted as u8) << 4 | ((offset_size - 1) << 6));
219
220        // Write dictionary size
221        write_offset(metadata_buffer, nkeys, offset_size);
222
223        // Write offsets
224        let mut cur_offset = 0;
225        for key in field_names.iter() {
226            write_offset(metadata_buffer, cur_offset, offset_size);
227            cur_offset += key.len();
228        }
229        // Write final offset
230        write_offset(metadata_buffer, cur_offset, offset_size);
231
232        // Write string data
233        for key in field_names {
234            metadata_buffer.extend_from_slice(key.as_bytes());
235        }
236
237        metadata_buffer.len()
238    }
239
240    /// Returns the inner buffer, consuming self without finalizing any in progress metadata.
241    pub fn into_inner(self) -> Vec<u8> {
242        self.metadata_buffer
243    }
244}
245
246impl<S: AsRef<str>> FromIterator<S> for WritableMetadataBuilder {
247    fn from_iter<T: IntoIterator<Item = S>>(iter: T) -> Self {
248        let mut this = Self::default();
249        this.extend(iter);
250
251        this
252    }
253}
254
255impl<S: AsRef<str>> Extend<S> for WritableMetadataBuilder {
256    fn extend<T: IntoIterator<Item = S>>(&mut self, iter: T) {
257        let iter = iter.into_iter();
258        let (min, _) = iter.size_hint();
259
260        self.field_names.reserve(min);
261
262        for field_name in iter {
263            self.upsert_field_name(field_name.as_ref());
264        }
265    }
266}
267
268#[cfg(test)]
269mod test {
270    use crate::{
271        ParentState, ValueBuilder, Variant, VariantBuilder, VariantMetadata,
272        builder::{
273            metadata::{ReadOnlyMetadataBuilder, WritableMetadataBuilder},
274            object::ObjectBuilder,
275        },
276    };
277
278    #[test]
279    fn test_metadata_builder_from_iter() {
280        let metadata = WritableMetadataBuilder::from_iter(vec!["apple", "banana", "cherry"]);
281        assert_eq!(metadata.num_field_names(), 3);
282        assert_eq!(metadata.field_name(0), "apple");
283        assert_eq!(metadata.field_name(1), "banana");
284        assert_eq!(metadata.field_name(2), "cherry");
285        assert!(metadata.is_sorted);
286
287        let metadata = WritableMetadataBuilder::from_iter(["zebra", "apple", "banana"]);
288        assert_eq!(metadata.num_field_names(), 3);
289        assert_eq!(metadata.field_name(0), "zebra");
290        assert_eq!(metadata.field_name(1), "apple");
291        assert_eq!(metadata.field_name(2), "banana");
292        assert!(!metadata.is_sorted);
293
294        let metadata = WritableMetadataBuilder::from_iter(Vec::<&str>::new());
295        assert_eq!(metadata.num_field_names(), 0);
296        assert!(!metadata.is_sorted);
297    }
298
299    #[test]
300    fn test_metadata_builder_extend() {
301        let mut metadata = WritableMetadataBuilder::default();
302        assert_eq!(metadata.num_field_names(), 0);
303        assert!(!metadata.is_sorted);
304
305        metadata.extend(["apple", "cherry"]);
306        assert_eq!(metadata.num_field_names(), 2);
307        assert_eq!(metadata.field_name(0), "apple");
308        assert_eq!(metadata.field_name(1), "cherry");
309        assert!(metadata.is_sorted);
310
311        // extend with more field names that maintain sort order
312        metadata.extend(vec!["dinosaur", "monkey"]);
313        assert_eq!(metadata.num_field_names(), 4);
314        assert_eq!(metadata.field_name(2), "dinosaur");
315        assert_eq!(metadata.field_name(3), "monkey");
316        assert!(metadata.is_sorted);
317
318        // test extending with duplicate field names
319        let initial_count = metadata.num_field_names();
320        metadata.extend(["apple", "monkey"]);
321        assert_eq!(metadata.num_field_names(), initial_count); // No new fields added
322    }
323
324    #[test]
325    fn test_metadata_builder_extend_sort_order() {
326        let mut metadata = WritableMetadataBuilder::default();
327
328        metadata.extend(["middle"]);
329        assert!(metadata.is_sorted);
330
331        metadata.extend(["zebra"]);
332        assert!(metadata.is_sorted);
333
334        // add field that breaks sort order
335        metadata.extend(["apple"]);
336        assert!(!metadata.is_sorted);
337    }
338
339    #[test]
340    fn test_metadata_builder_from_iter_with_string_types() {
341        // &str
342        let metadata = WritableMetadataBuilder::from_iter(["a", "b", "c"]);
343        assert_eq!(metadata.num_field_names(), 3);
344
345        // string
346        let metadata = WritableMetadataBuilder::from_iter(vec![
347            "a".to_string(),
348            "b".to_string(),
349            "c".to_string(),
350        ]);
351        assert_eq!(metadata.num_field_names(), 3);
352
353        // mixed types (anything that implements AsRef<str>)
354        let field_names: Vec<Box<str>> = vec!["a".into(), "b".into(), "c".into()];
355        let metadata = WritableMetadataBuilder::from_iter(field_names);
356        assert_eq!(metadata.num_field_names(), 3);
357    }
358
359    #[test]
360    fn test_read_only_metadata_builder() {
361        // First create some metadata with a few field names
362        let mut default_builder = VariantBuilder::new();
363        default_builder.add_field_name("name");
364        default_builder.add_field_name("age");
365        default_builder.add_field_name("active");
366        let (metadata_bytes, _) = default_builder.finish();
367
368        // Use the metadata to build new variant values
369        let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
370        let mut metadata_builder = ReadOnlyMetadataBuilder::new(&metadata);
371        let mut value_builder = ValueBuilder::new();
372
373        {
374            let state = ParentState::variant(&mut value_builder, &mut metadata_builder);
375            let mut obj = ObjectBuilder::new(state, false);
376
377            // These should succeed because the fields exist in the metadata
378            obj.insert("name", "Alice");
379            obj.insert("age", 30i8);
380            obj.insert("active", true);
381            obj.finish();
382        }
383
384        let value = value_builder.into_inner();
385
386        // Verify the variant was built correctly
387        let variant = Variant::try_new(&metadata_bytes, &value).unwrap();
388        let obj = variant.as_object().unwrap();
389        assert_eq!(obj.get("name"), Some(Variant::from("Alice")));
390        assert_eq!(obj.get("age"), Some(Variant::Int8(30)));
391        assert_eq!(obj.get("active"), Some(Variant::from(true)));
392    }
393
394    #[test]
395    fn test_read_only_metadata_builder_fails_on_unknown_field() {
396        // Create metadata with only one field
397        let mut default_builder = VariantBuilder::new();
398        default_builder.add_field_name("known_field");
399        let (metadata_bytes, _) = default_builder.finish();
400
401        // Use the metadata to build new variant values
402        let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
403        let mut metadata_builder = ReadOnlyMetadataBuilder::new(&metadata);
404        let mut value_builder = ValueBuilder::new();
405
406        {
407            let state = ParentState::variant(&mut value_builder, &mut metadata_builder);
408            let mut obj = ObjectBuilder::new(state, false);
409
410            // This should succeed
411            obj.insert("known_field", "value");
412
413            // This should fail because "unknown_field" is not in the metadata
414            let result = obj.try_insert("unknown_field", "value");
415            assert!(result.is_err());
416            assert!(
417                result
418                    .unwrap_err()
419                    .to_string()
420                    .contains("Field name 'unknown_field' not found")
421            );
422        }
423    }
424}