parquet_variant/variant/
metadata.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
19use crate::utils::{
20    first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
21    try_binary_search_range_by,
22};
23
24use arrow_schema::ArrowError;
25
26/// Header structure for [`VariantMetadata`]
27#[derive(Debug, Clone, Copy, PartialEq)]
28pub(crate) struct VariantMetadataHeader {
29    version: u8,
30    is_sorted: bool,
31    /// Note: This is `offset_size_minus_one` + 1
32    offset_size: OffsetSizeBytes,
33}
34
35// According to the spec this is currently always = 1, and so we store this const for validation
36// purposes and to make that visible.
37const CORRECT_VERSION_VALUE: u8 = 1;
38
39// The metadata header occupies one byte; use a named constant for readability
40const NUM_HEADER_BYTES: u32 = 1;
41
42impl VariantMetadataHeader {
43    // Hide the cast
44    const fn offset_size(&self) -> u32 {
45        self.offset_size as u32
46    }
47
48    // Avoid materializing this offset, since it's cheaply and safely computable
49    const fn first_offset_byte(&self) -> u32 {
50        NUM_HEADER_BYTES + self.offset_size()
51    }
52
53    /// Tries to construct the variant metadata header, which has the form
54    ///
55    /// ```text
56    ///              7     6  5   4  3             0
57    ///             +-------+---+---+---------------+
58    /// header      |       |   |   |    version    |
59    ///             +-------+---+---+---------------+
60    ///                 ^         ^
61    ///                 |         +-- sorted_strings
62    ///                 +-- offset_size_minus_one
63    /// ```
64    ///
65    /// The version is a 4-bit value that must always contain the value 1.
66    /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
67    /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
68    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
69    pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
70        let version = header_byte & 0x0F; // First four bits
71        if version != CORRECT_VERSION_VALUE {
72            let err_msg = format!(
73                "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {version:b}",
74            );
75            return Err(ArrowError::InvalidArgumentError(err_msg));
76        }
77        let is_sorted = (header_byte & 0x10) != 0; // Fifth bit
78        let offset_size_minus_one = header_byte >> 6; // Last two bits
79        Ok(Self {
80            version,
81            is_sorted,
82            offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
83        })
84    }
85}
86
87/// [`Variant`] Metadata
88///
89/// See the [Variant Spec] file for more information
90///
91/// # Validation
92///
93/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
94/// underlying bytes are a valid encoding of variant metadata (see below).
95///
96/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
97/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
98/// validation cost is linear in the number of underlying bytes.
99///
100/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
101/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
102/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
103/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
104/// _unvalidated_ instance, if desired.
105///
106/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
107/// knows the underlying bytes were already validated previously, or if the caller intends to
108/// perform a small number of (fallible) accesses to a large dictionary.
109///
110/// A _validated_ variant [metadata instance guarantees that:
111///
112/// - header byte is valid
113/// - dictionary size is in bounds
114/// - offset array content is in-bounds
115/// - first offset is zero
116/// - last offset is in-bounds
117/// - all other offsets are in-bounds (*)
118/// - all offsets are monotonically increasing (*)
119/// - all values are valid utf-8 (*)
120///
121/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
122/// in the list above); it panics any of the other checks fails.
123///
124/// # Safety
125///
126/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing
127/// it with infallible methods may cause panics but will never lead to undefined behavior.
128///
129/// [`Variant`]: crate::Variant
130/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
131#[derive(Debug, Clone, PartialEq)]
132pub struct VariantMetadata<'m> {
133    /// (Only) the bytes that make up this metadata instance.
134    pub(crate) bytes: &'m [u8],
135    header: VariantMetadataHeader,
136    dictionary_size: u32,
137    first_value_byte: u32,
138    validated: bool,
139}
140
141// We don't want this to grow because it increases the size of VariantList and VariantObject, which
142// could increase the size of Variant. All those size increases could hurt performance.
143const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
144
145/// The canonical byte slice corresponding to an empty metadata dictionary.
146///
147/// ```
148/// # use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, VariantMetadata, WritableMetadataBuilder};
149/// let mut metadata_builder = WritableMetadataBuilder::default();
150/// metadata_builder.finish();
151/// let metadata_bytes = metadata_builder.into_inner();
152/// assert_eq!(&metadata_bytes, EMPTY_VARIANT_METADATA_BYTES);
153/// ```
154pub const EMPTY_VARIANT_METADATA_BYTES: &[u8] = &[1, 0, 0];
155
156/// The empty metadata dictionary.
157///
158/// ```
159/// # use parquet_variant::{EMPTY_VARIANT_METADATA, VariantMetadata, WritableMetadataBuilder};
160/// let mut metadata_builder = WritableMetadataBuilder::default();
161/// metadata_builder.finish();
162/// let metadata_bytes = metadata_builder.into_inner();
163/// let empty_metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
164/// assert_eq!(empty_metadata, EMPTY_VARIANT_METADATA);
165/// ```
166pub const EMPTY_VARIANT_METADATA: VariantMetadata = VariantMetadata {
167    bytes: EMPTY_VARIANT_METADATA_BYTES,
168    header: VariantMetadataHeader {
169        version: CORRECT_VERSION_VALUE,
170        is_sorted: false,
171        offset_size: OffsetSizeBytes::One,
172    },
173    dictionary_size: 0,
174    first_value_byte: 3,
175    validated: true,
176};
177
178impl<'m> VariantMetadata<'m> {
179    /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
180    /// dictionary entries.
181    ///
182    /// [validation]: Self#Validation
183    pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
184        Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
185    }
186
187    /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
188    /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as
189    /// indexing and iteration could also panic if the underlying bytes are invalid.
190    ///
191    /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes
192    /// were already validated previously by other means, or if the caller expects a small number of
193    /// accesses to a large dictionary (preferring to use a small number of fallible accesses as
194    /// needed, instead of paying expensive full validation up front).
195    ///
196    /// [validate]: Self#Validation
197    pub fn new(bytes: &'m [u8]) -> Self {
198        Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
199    }
200
201    // The actual constructor, which performs only basic (constant-const) validation.
202    pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
203        let header_byte = first_byte_from_slice(bytes)?;
204        let header = VariantMetadataHeader::try_new(header_byte)?;
205
206        // First element after header is dictionary size; the offset array immediately follows.
207        let dictionary_size =
208            header
209                .offset_size
210                .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
211
212        // Calculate the starting offset of the dictionary string bytes.
213        //
214        // There are dict_size + 1 offsets, and the value bytes immediately follow
215        // = (dict_size + 1) * offset_size + header.first_offset_byte()
216        let first_value_byte = dictionary_size
217            .checked_add(1)
218            .and_then(|n| n.checked_mul(header.offset_size()))
219            .and_then(|n| n.checked_add(header.first_offset_byte()))
220            .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
221
222        let mut new_self = Self {
223            bytes,
224            header,
225            dictionary_size,
226            first_value_byte,
227            validated: false,
228        };
229
230        // Validate just the first and last offset, ignoring the other offsets and all value bytes.
231        let first_offset = new_self.get_offset(0)?;
232        if first_offset != 0 {
233            return Err(ArrowError::InvalidArgumentError(format!(
234                "First offset is not zero: {first_offset}"
235            )));
236        }
237
238        // Use the last offset to upper-bound the byte slice
239        let last_offset = new_self
240            .get_offset(dictionary_size as _)?
241            .checked_add(first_value_byte)
242            .ok_or_else(|| overflow_error("variant metadata size"))?;
243        new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
244        Ok(new_self)
245    }
246
247    /// The number of metadata dictionary entries
248    pub fn len(&self) -> usize {
249        self.dictionary_size as _
250    }
251
252    /// True if this metadata dictionary contains no entries
253    pub fn is_empty(&self) -> bool {
254        self.len() == 0
255    }
256
257    /// True if this instance is fully [validated] for panic-free infallible accesses.
258    ///
259    /// [validated]: Self#Validation
260    pub fn is_fully_validated(&self) -> bool {
261        self.validated
262    }
263
264    /// Performs a full [validation] of this metadata dictionary and returns the result.
265    ///
266    /// [validation]: Self#Validation
267    pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
268        if !self.validated {
269            let offset_bytes = slice_from_slice(
270                self.bytes,
271                self.header.first_offset_byte() as _..self.first_value_byte as _,
272            )?;
273
274            // Verify the string values in the dictionary are UTF-8 encoded strings.
275            let value_buffer =
276                string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
277
278            let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
279
280            if self.header.is_sorted {
281                // Validate the dictionary values are unique and lexicographically sorted
282                //
283                // Since we use the offsets to access dictionary values, this also validates
284                // offsets are in-bounds and monotonically increasing
285                let mut current_offset = offsets.next().unwrap_or(0);
286                let mut prev_value: Option<&str> = None;
287                for next_offset in offsets {
288                    let current_value =
289                        value_buffer
290                            .get(current_offset..next_offset)
291                            .ok_or_else(|| {
292                                ArrowError::InvalidArgumentError(format!(
293                                "range {current_offset}..{next_offset} is invalid or out of bounds"
294                            ))
295                            })?;
296
297                    if let Some(prev_val) = prev_value {
298                        if current_value <= prev_val {
299                            return Err(ArrowError::InvalidArgumentError(
300                                "dictionary values are not unique and ordered".to_string(),
301                            ));
302                        }
303                    }
304
305                    prev_value = Some(current_value);
306                    current_offset = next_offset;
307                }
308            } else {
309                // Validate offsets are in-bounds and monotonically increasing
310                //
311                // Since shallow validation ensures the first and last offsets are in bounds,
312                // we can also verify all offsets are in-bounds by checking if
313                // offsets are monotonically increasing
314                if !offsets.is_sorted_by(|a, b| a < b) {
315                    return Err(ArrowError::InvalidArgumentError(
316                        "offsets not monotonically increasing".to_string(),
317                    ));
318                }
319            }
320
321            self.validated = true;
322        }
323        Ok(self)
324    }
325
326    /// Whether the dictionary keys are sorted and unique
327    pub fn is_sorted(&self) -> bool {
328        self.header.is_sorted
329    }
330
331    /// The variant protocol version
332    pub const fn version(&self) -> u8 {
333        self.header.version
334    }
335
336    /// Gets an offset into the dictionary entry by index.
337    ///
338    /// This offset is an index into the dictionary, at the boundary between string `i-1` and string
339    /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
340    fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
341        let offset_byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
342        let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
343        self.header.offset_size.unpack_u32(bytes, i)
344    }
345
346    /// Returns the total size, in bytes, of the metadata.
347    ///
348    /// Note this value may be smaller than what was passed to [`Self::new`] or
349    /// [`Self::try_new`] if the input was larger than necessary to encode the
350    /// metadata dictionary.
351    pub fn size(&self) -> usize {
352        self.bytes.len()
353    }
354
355    /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
356    /// underlying bytes are [invalid].
357    ///
358    /// [invalid]: Self#Validation
359    pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
360        let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as _;
361        string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
362    }
363
364    // Helper method used by our `impl Index` and also by `get_entry`. Panics if the underlying
365    // bytes are invalid. Needed because the `Index` trait forces the returned result to have the
366    // lifetime of `self` instead of the string's own (longer) lifetime `'m`.
367    fn get_impl(&self, i: usize) -> &'m str {
368        self.get(i).expect("Invalid metadata dictionary entry")
369    }
370
371    /// Attempts to retrieve a dictionary entry and its field id, returning None if the requested field
372    /// name is not present. The search cost is logarithmic if [`Self::is_sorted`] and linear
373    /// otherwise.
374    ///
375    /// WARNING: This method panics if the underlying bytes are [invalid].
376    ///
377    /// [invalid]: Self#Validation
378    pub fn get_entry(&self, field_name: &str) -> Option<(u32, &'m str)> {
379        let field_id = if self.is_sorted() && self.len() > 10 {
380            // Binary search is faster for a not-tiny sorted metadata dictionary
381            let cmp = |i| Some(self.get_impl(i).cmp(field_name));
382            try_binary_search_range_by(0..self.len(), cmp)?.ok()?
383        } else {
384            // Fall back to Linear search for tiny or unsorted dictionary
385            (0..self.len()).find(|i| self.get_impl(*i) == field_name)?
386        };
387        Some((field_id as u32, self.get_impl(field_id)))
388    }
389
390    /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the
391    /// iterator encounters [invalid] data.
392    ///
393    /// [invalid]: Self#Validation
394    pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
395        (0..self.len()).map(|i| self.get(i))
396    }
397
398    /// Iterates over all dictionary entries. When working with [unvalidated] input, consider
399    /// [`Self::iter_try`] to avoid panics due to invalid data.
400    ///
401    /// [unvalidated]: Self#Validation
402    pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
403        self.iter_try()
404            .map(|result| result.expect("Invalid metadata dictionary entry"))
405    }
406}
407
408/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
409/// [unvalidated] input could also panic if the underlying bytes are invalid.
410///
411/// [unvalidated]: Self#Validation
412impl std::ops::Index<usize> for VariantMetadata<'_> {
413    type Output = str;
414
415    fn index(&self, i: usize) -> &str {
416        self.get_impl(i)
417    }
418}
419
420#[cfg(test)]
421mod tests {
422
423    use crate::VariantBuilder;
424
425    use super::*;
426
427    /// `"cat"`, `"dog"` – valid metadata
428    #[test]
429    fn try_new_ok_inline() {
430        let bytes = &[
431            0b0000_0001, // header, offset_size_minus_one=0 and version=1
432            0x02,        // dictionary_size (2 strings)
433            0x00,
434            0x03,
435            0x06,
436            b'c',
437            b'a',
438            b't',
439            b'd',
440            b'o',
441            b'g',
442        ];
443
444        let md = VariantMetadata::try_new(bytes).expect("should parse");
445        assert_eq!(md.len(), 2);
446        // Fields
447        assert_eq!(&md[0], "cat");
448        assert_eq!(&md[1], "dog");
449
450        // Offsets
451        assert_eq!(md.get_offset(0).unwrap(), 0x00);
452        assert_eq!(md.get_offset(1).unwrap(), 0x03);
453        assert_eq!(md.get_offset(2).unwrap(), 0x06);
454
455        let err = md.get_offset(3).unwrap_err();
456        assert!(
457            matches!(err, ArrowError::InvalidArgumentError(_)),
458            "unexpected error: {err:?}"
459        );
460
461        let fields: Vec<(usize, &str)> = md.iter().enumerate().collect();
462        assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]);
463    }
464
465    /// Too short buffer test (missing one required offset).
466    /// Should error with "metadata shorter than dictionary_size implies".
467    #[test]
468    fn try_new_missing_last_value() {
469        let bytes = &[
470            0b0000_0001, // header, offset_size_minus_one=0 and version=1
471            0x02,        // dictionary_size = 2
472            0x00,
473            0x01,
474            0x02,
475            b'a',
476            b'b', // <-- we'll remove this
477        ];
478
479        let working_md = VariantMetadata::try_new(bytes).expect("should parse");
480        assert_eq!(working_md.len(), 2);
481        assert_eq!(&working_md[0], "a");
482        assert_eq!(&working_md[1], "b");
483
484        let truncated = &bytes[..bytes.len() - 1];
485
486        let err = VariantMetadata::try_new(truncated).unwrap_err();
487        assert!(
488            matches!(err, ArrowError::InvalidArgumentError(_)),
489            "unexpected error: {err:?}"
490        );
491    }
492
493    #[test]
494    fn try_new_fails_non_monotonic() {
495        // 'cat', 'dog', 'lamb'
496        let bytes = &[
497            0b0000_0001, // header, offset_size_minus_one=0 and version=1
498            0x03,        // dictionary_size
499            0x00,
500            0x02,
501            0x01, // Doesn't increase monotonically
502            0x10,
503            b'c',
504            b'a',
505            b't',
506            b'd',
507            b'o',
508            b'g',
509            b'l',
510            b'a',
511            b'm',
512            b'b',
513        ];
514
515        let err = VariantMetadata::try_new(bytes).unwrap_err();
516        assert!(
517            matches!(err, ArrowError::InvalidArgumentError(_)),
518            "unexpected error: {err:?}"
519        );
520    }
521
522    #[test]
523    fn try_new_fails_non_monotonic2() {
524        // this test case checks whether offsets are monotonic in the full validation logic.
525
526        // 'cat', 'dog', 'lamb', "eel"
527        let bytes = &[
528            0b0000_0001, // header, offset_size_minus_one=0 and version=1
529            4,           // dictionary_size
530            0x00,
531            0x02,
532            0x01, // Doesn't increase monotonically
533            0x10,
534            13,
535            b'c',
536            b'a',
537            b't',
538            b'd',
539            b'o',
540            b'g',
541            b'l',
542            b'a',
543            b'm',
544            b'b',
545            b'e',
546            b'e',
547            b'l',
548        ];
549
550        let err = VariantMetadata::try_new(bytes).unwrap_err();
551
552        assert!(
553            matches!(err, ArrowError::InvalidArgumentError(_)),
554            "unexpected error: {err:?}"
555        );
556    }
557
558    #[test]
559    fn try_new_truncated_offsets_inline() {
560        // Missing final offset
561        let bytes = &[0b0000_0001, 0x02, 0x00, 0x01];
562
563        let err = VariantMetadata::try_new(bytes).unwrap_err();
564        assert!(
565            matches!(err, ArrowError::InvalidArgumentError(_)),
566            "unexpected error: {err:?}"
567        );
568    }
569
570    #[test]
571    fn empty_string_is_valid() {
572        let bytes = &[
573            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
574            1,
575            0x00,
576            0x00,
577        ];
578        let metadata = VariantMetadata::try_new(bytes).unwrap();
579        assert_eq!(&metadata[0], "");
580
581        let bytes = &[
582            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
583            2,
584            0x00,
585            0x00,
586            0x02,
587            b'h',
588            b'i',
589        ];
590        let metadata = VariantMetadata::try_new(bytes).unwrap();
591        assert_eq!(&metadata[0], "");
592        assert_eq!(&metadata[1], "hi");
593
594        let bytes = &[
595            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
596            2,
597            0x00,
598            0x02,
599            0x02, // empty string is allowed, but must be first in a sorted dict
600            b'h',
601            b'i',
602        ];
603        let err = VariantMetadata::try_new(bytes).unwrap_err();
604        assert!(
605            matches!(err, ArrowError::InvalidArgumentError(_)),
606            "unexpected error: {err:?}"
607        );
608    }
609
610    #[test]
611    fn test_compare_sorted_dictionary_with_unsorted_dictionary() {
612        // create a sorted object
613        let mut b = VariantBuilder::new();
614        let mut o = b.new_object();
615
616        o.insert("a", false);
617        o.insert("b", false);
618
619        o.finish();
620
621        let (m, _) = b.finish();
622
623        let m1 = VariantMetadata::new(&m);
624        assert!(m1.is_sorted());
625
626        // Create metadata with an unsorted dictionary (field names are "a", "a", "b")
627        // Since field names are not unique, it is considered not sorted.
628        let metadata_bytes = vec![
629            0b0000_0001,
630            3, // dictionary size
631            0, // "a"
632            1, // "a"
633            2, // "b"
634            3,
635            b'a',
636            b'a',
637            b'b',
638        ];
639        let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap();
640        assert!(!m2.is_sorted());
641
642        assert_ne!(m1, m2);
643    }
644
645    #[test]
646    fn test_compare_sorted_dictionary_with_sorted_dictionary() {
647        // create a sorted object
648        let mut b = VariantBuilder::new();
649        let mut o = b.new_object();
650
651        o.insert("a", false);
652        o.insert("b", false);
653
654        o.finish();
655
656        let (m, _) = b.finish();
657
658        let m1 = VariantMetadata::new(&m);
659        let m2 = VariantMetadata::new(&m);
660
661        assert_eq!(m1, m2);
662    }
663}