parquet_variant/variant/
metadata.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{OffsetSizeBytes, map_bytes_to_offsets};
19use crate::utils::{
20    first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
21    try_binary_search_range_by,
22};
23
24use arrow_schema::ArrowError;
25
26/// Header structure for [`VariantMetadata`]
27#[derive(Debug, Clone, Copy, PartialEq)]
28pub(crate) struct VariantMetadataHeader {
29    version: u8,
30    is_sorted: bool,
31    /// Note: This is `offset_size_minus_one` + 1
32    offset_size: OffsetSizeBytes,
33}
34
35// According to the spec this is currently always = 1, and so we store this const for validation
36// purposes and to make that visible.
37const CORRECT_VERSION_VALUE: u8 = 1;
38
39// The metadata header occupies one byte; use a named constant for readability
40const NUM_HEADER_BYTES: u32 = 1;
41
42impl VariantMetadataHeader {
43    // Hide the cast
44    const fn offset_size(&self) -> u32 {
45        self.offset_size as u32
46    }
47
48    // Avoid materializing this offset, since it's cheaply and safely computable
49    const fn first_offset_byte(&self) -> u32 {
50        NUM_HEADER_BYTES + self.offset_size()
51    }
52
53    /// Tries to construct the variant metadata header, which has the form
54    ///
55    /// ```text
56    ///              7     6  5   4  3             0
57    ///             +-------+---+---+---------------+
58    /// header      |       |   |   |    version    |
59    ///             +-------+---+---+---------------+
60    ///                 ^         ^
61    ///                 |         +-- sorted_strings
62    ///                 +-- offset_size_minus_one
63    /// ```
64    ///
65    /// The version is a 4-bit value that must always contain the value 1.
66    /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
67    /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
68    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
69    pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
70        let version = header_byte & 0x0F; // First four bits
71        if version != CORRECT_VERSION_VALUE {
72            let err_msg = format!(
73                "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {version:b}",
74            );
75            return Err(ArrowError::InvalidArgumentError(err_msg));
76        }
77        let is_sorted = (header_byte & 0x10) != 0; // Fifth bit
78        let offset_size_minus_one = header_byte >> 6; // Last two bits
79        Ok(Self {
80            version,
81            is_sorted,
82            offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
83        })
84    }
85}
86
87/// [`Variant`] Metadata
88///
89/// See the [Variant Spec] file for more information
90///
91/// # Validation
92///
93/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
94/// underlying bytes are a valid encoding of variant metadata (see below).
95///
96/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
97/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
98/// validation cost is linear in the number of underlying bytes.
99///
100/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
101/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
102/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
103/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
104/// _unvalidated_ instance, if desired.
105///
106/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
107/// knows the underlying bytes were already validated previously, or if the caller intends to
108/// perform a small number of (fallible) accesses to a large dictionary.
109///
110/// A _validated_ variant [metadata instance guarantees that:
111///
112/// - header byte is valid
113/// - dictionary size is in bounds
114/// - offset array content is in-bounds
115/// - first offset is zero
116/// - last offset is in-bounds
117/// - all other offsets are in-bounds (*)
118/// - all offsets are monotonically increasing (*)
119/// - all values are valid utf-8 (*)
120///
121/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
122/// in the list above); it panics any of the other checks fails.
123///
124/// # Safety
125///
126/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing
127/// it with infallible methods may cause panics but will never lead to undefined behavior.
128///
129/// [`Variant`]: crate::Variant
130/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
131#[derive(Debug, Clone, PartialEq)]
132pub struct VariantMetadata<'m> {
133    /// (Only) the bytes that make up this metadata instance.
134    pub(crate) bytes: &'m [u8],
135    header: VariantMetadataHeader,
136    dictionary_size: u32,
137    first_value_byte: u32,
138    validated: bool,
139}
140
141// We don't want this to grow because it increases the size of VariantList and VariantObject, which
142// could increase the size of Variant. All those size increases could hurt performance.
143const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
144
145/// The canonical byte slice corresponding to an empty metadata dictionary.
146///
147/// ```
148/// # use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, VariantMetadata, WritableMetadataBuilder};
149/// let mut metadata_builder = WritableMetadataBuilder::default();
150/// metadata_builder.finish();
151/// let metadata_bytes = metadata_builder.into_inner();
152/// assert_eq!(&metadata_bytes, EMPTY_VARIANT_METADATA_BYTES);
153/// ```
154pub const EMPTY_VARIANT_METADATA_BYTES: &[u8] = &[1, 0, 0];
155
156/// The empty metadata dictionary.
157///
158/// ```
159/// # use parquet_variant::{EMPTY_VARIANT_METADATA, VariantMetadata, WritableMetadataBuilder};
160/// let mut metadata_builder = WritableMetadataBuilder::default();
161/// metadata_builder.finish();
162/// let metadata_bytes = metadata_builder.into_inner();
163/// let empty_metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
164/// assert_eq!(empty_metadata, EMPTY_VARIANT_METADATA);
165/// ```
166pub const EMPTY_VARIANT_METADATA: VariantMetadata = VariantMetadata {
167    bytes: EMPTY_VARIANT_METADATA_BYTES,
168    header: VariantMetadataHeader {
169        version: CORRECT_VERSION_VALUE,
170        is_sorted: false,
171        offset_size: OffsetSizeBytes::One,
172    },
173    dictionary_size: 0,
174    first_value_byte: 3,
175    validated: true,
176};
177
178impl<'m> VariantMetadata<'m> {
179    /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
180    /// dictionary entries.
181    ///
182    /// [validation]: Self#Validation
183    pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
184        Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
185    }
186
187    /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
188    /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as
189    /// indexing and iteration could also panic if the underlying bytes are invalid.
190    ///
191    /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes
192    /// were already validated previously by other means, or if the caller expects a small number of
193    /// accesses to a large dictionary (preferring to use a small number of fallible accesses as
194    /// needed, instead of paying expensive full validation up front).
195    ///
196    /// [validate]: Self#Validation
197    pub fn new(bytes: &'m [u8]) -> Self {
198        Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
199    }
200
201    // The actual constructor, which performs only basic (constant-const) validation.
202    pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
203        let header_byte = first_byte_from_slice(bytes)?;
204        let header = VariantMetadataHeader::try_new(header_byte)?;
205
206        // First element after header is dictionary size; the offset array immediately follows.
207        let dictionary_size =
208            header
209                .offset_size
210                .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
211
212        // Calculate the starting offset of the dictionary string bytes.
213        //
214        // There are dict_size + 1 offsets, and the value bytes immediately follow
215        // = (dict_size + 1) * offset_size + header.first_offset_byte()
216        let first_value_byte = dictionary_size
217            .checked_add(1)
218            .and_then(|n| n.checked_mul(header.offset_size()))
219            .and_then(|n| n.checked_add(header.first_offset_byte()))
220            .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
221
222        let mut new_self = Self {
223            bytes,
224            header,
225            dictionary_size,
226            first_value_byte,
227            validated: false,
228        };
229
230        // Validate just the first and last offset, ignoring the other offsets and all value bytes.
231        let first_offset = new_self.get_offset(0)?;
232        if first_offset != 0 {
233            return Err(ArrowError::InvalidArgumentError(format!(
234                "First offset is not zero: {first_offset}"
235            )));
236        }
237
238        // Use the last offset to upper-bound the byte slice
239        let last_offset = new_self
240            .get_offset(dictionary_size as _)?
241            .checked_add(first_value_byte)
242            .ok_or_else(|| overflow_error("variant metadata size"))?;
243        new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
244        Ok(new_self)
245    }
246
247    /// The number of metadata dictionary entries
248    pub fn len(&self) -> usize {
249        self.dictionary_size as _
250    }
251
252    /// True if this metadata dictionary contains no entries
253    pub fn is_empty(&self) -> bool {
254        self.len() == 0
255    }
256
257    /// True if this instance is fully [validated] for panic-free infallible accesses.
258    ///
259    /// [validated]: Self#Validation
260    pub fn is_fully_validated(&self) -> bool {
261        self.validated
262    }
263
264    /// Performs a full [validation] of this metadata dictionary and returns the result.
265    ///
266    /// [validation]: Self#Validation
267    pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
268        if !self.validated {
269            let offset_bytes = slice_from_slice(
270                self.bytes,
271                self.header.first_offset_byte() as _..self.first_value_byte as _,
272            )?;
273
274            // Verify the string values in the dictionary are UTF-8 encoded strings.
275            let value_buffer =
276                string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
277
278            let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
279
280            if self.header.is_sorted {
281                // Validate the dictionary values are unique and lexicographically sorted
282                //
283                // Since we use the offsets to access dictionary values, this also validates
284                // offsets are in-bounds and monotonically increasing
285                let mut current_offset = offsets.next().unwrap_or(0);
286                let mut prev_value: Option<&str> = None;
287                for next_offset in offsets {
288                    let current_value = value_buffer.get(current_offset..next_offset).ok_or_else(
289                        || {
290                            ArrowError::InvalidArgumentError(format!(
291                                "range {current_offset}..{next_offset} is invalid or out of bounds"
292                            ))
293                        },
294                    )?;
295
296                    if let Some(prev_val) = prev_value {
297                        if current_value <= prev_val {
298                            return Err(ArrowError::InvalidArgumentError(
299                                "dictionary values are not unique and ordered".to_string(),
300                            ));
301                        }
302                    }
303
304                    prev_value = Some(current_value);
305                    current_offset = next_offset;
306                }
307            } else {
308                // Validate offsets are in-bounds and monotonically increasing
309                //
310                // Since shallow validation ensures the first and last offsets are in bounds,
311                // we can also verify all offsets are in-bounds by checking if
312                // offsets are monotonically increasing
313                if !offsets.is_sorted_by(|a, b| a < b) {
314                    return Err(ArrowError::InvalidArgumentError(
315                        "offsets not monotonically increasing".to_string(),
316                    ));
317                }
318            }
319
320            self.validated = true;
321        }
322        Ok(self)
323    }
324
325    /// Whether the dictionary keys are sorted and unique
326    pub fn is_sorted(&self) -> bool {
327        self.header.is_sorted
328    }
329
330    /// The variant protocol version
331    pub const fn version(&self) -> u8 {
332        self.header.version
333    }
334
335    /// Gets an offset into the dictionary entry by index.
336    ///
337    /// This offset is an index into the dictionary, at the boundary between string `i-1` and string
338    /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
339    fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
340        let offset_byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
341        let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
342        self.header.offset_size.unpack_u32(bytes, i)
343    }
344
345    /// Returns the total size, in bytes, of the metadata.
346    ///
347    /// Note this value may be smaller than what was passed to [`Self::new`] or
348    /// [`Self::try_new`] if the input was larger than necessary to encode the
349    /// metadata dictionary.
350    pub fn size(&self) -> usize {
351        self.bytes.len()
352    }
353
354    /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
355    /// underlying bytes are [invalid].
356    ///
357    /// [invalid]: Self#Validation
358    pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
359        let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as _;
360        string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
361    }
362
363    // Helper method used by our `impl Index` and also by `get_entry`. Panics if the underlying
364    // bytes are invalid. Needed because the `Index` trait forces the returned result to have the
365    // lifetime of `self` instead of the string's own (longer) lifetime `'m`.
366    fn get_impl(&self, i: usize) -> &'m str {
367        self.get(i).expect("Invalid metadata dictionary entry")
368    }
369
370    /// Attempts to retrieve a dictionary entry and its field id, returning None if the requested field
371    /// name is not present. The search cost is logarithmic if [`Self::is_sorted`] and linear
372    /// otherwise.
373    ///
374    /// WARNING: This method panics if the underlying bytes are [invalid].
375    ///
376    /// [invalid]: Self#Validation
377    pub fn get_entry(&self, field_name: &str) -> Option<(u32, &'m str)> {
378        let field_id = if self.is_sorted() && self.len() > 10 {
379            // Binary search is faster for a not-tiny sorted metadata dictionary
380            let cmp = |i| Some(self.get_impl(i).cmp(field_name));
381            try_binary_search_range_by(0..self.len(), cmp)?.ok()?
382        } else {
383            // Fall back to Linear search for tiny or unsorted dictionary
384            (0..self.len()).find(|i| self.get_impl(*i) == field_name)?
385        };
386        Some((field_id as u32, self.get_impl(field_id)))
387    }
388
389    /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the
390    /// iterator encounters [invalid] data.
391    ///
392    /// [invalid]: Self#Validation
393    pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
394        (0..self.len()).map(|i| self.get(i))
395    }
396
397    /// Iterates over all dictionary entries. When working with [unvalidated] input, consider
398    /// [`Self::iter_try`] to avoid panics due to invalid data.
399    ///
400    /// [unvalidated]: Self#Validation
401    pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
402        self.iter_try()
403            .map(|result| result.expect("Invalid metadata dictionary entry"))
404    }
405}
406
407/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
408/// [unvalidated] input could also panic if the underlying bytes are invalid.
409///
410/// [unvalidated]: Self#Validation
411impl std::ops::Index<usize> for VariantMetadata<'_> {
412    type Output = str;
413
414    fn index(&self, i: usize) -> &str {
415        self.get_impl(i)
416    }
417}
418
419#[cfg(test)]
420mod tests {
421
422    use crate::VariantBuilder;
423
424    use super::*;
425
426    /// `"cat"`, `"dog"` – valid metadata
427    #[test]
428    fn try_new_ok_inline() {
429        let bytes = &[
430            0b0000_0001, // header, offset_size_minus_one=0 and version=1
431            0x02,        // dictionary_size (2 strings)
432            0x00,
433            0x03,
434            0x06,
435            b'c',
436            b'a',
437            b't',
438            b'd',
439            b'o',
440            b'g',
441        ];
442
443        let md = VariantMetadata::try_new(bytes).expect("should parse");
444        assert_eq!(md.len(), 2);
445        // Fields
446        assert_eq!(&md[0], "cat");
447        assert_eq!(&md[1], "dog");
448
449        // Offsets
450        assert_eq!(md.get_offset(0).unwrap(), 0x00);
451        assert_eq!(md.get_offset(1).unwrap(), 0x03);
452        assert_eq!(md.get_offset(2).unwrap(), 0x06);
453
454        let err = md.get_offset(3).unwrap_err();
455        assert!(
456            matches!(err, ArrowError::InvalidArgumentError(_)),
457            "unexpected error: {err:?}"
458        );
459
460        let fields: Vec<(usize, &str)> = md.iter().enumerate().collect();
461        assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]);
462    }
463
464    /// Too short buffer test (missing one required offset).
465    /// Should error with "metadata shorter than dictionary_size implies".
466    #[test]
467    fn try_new_missing_last_value() {
468        let bytes = &[
469            0b0000_0001, // header, offset_size_minus_one=0 and version=1
470            0x02,        // dictionary_size = 2
471            0x00,
472            0x01,
473            0x02,
474            b'a',
475            b'b', // <-- we'll remove this
476        ];
477
478        let working_md = VariantMetadata::try_new(bytes).expect("should parse");
479        assert_eq!(working_md.len(), 2);
480        assert_eq!(&working_md[0], "a");
481        assert_eq!(&working_md[1], "b");
482
483        let truncated = &bytes[..bytes.len() - 1];
484
485        let err = VariantMetadata::try_new(truncated).unwrap_err();
486        assert!(
487            matches!(err, ArrowError::InvalidArgumentError(_)),
488            "unexpected error: {err:?}"
489        );
490    }
491
492    #[test]
493    fn try_new_fails_non_monotonic() {
494        // 'cat', 'dog', 'lamb'
495        let bytes = &[
496            0b0000_0001, // header, offset_size_minus_one=0 and version=1
497            0x03,        // dictionary_size
498            0x00,
499            0x02,
500            0x01, // Doesn't increase monotonically
501            0x10,
502            b'c',
503            b'a',
504            b't',
505            b'd',
506            b'o',
507            b'g',
508            b'l',
509            b'a',
510            b'm',
511            b'b',
512        ];
513
514        let err = VariantMetadata::try_new(bytes).unwrap_err();
515        assert!(
516            matches!(err, ArrowError::InvalidArgumentError(_)),
517            "unexpected error: {err:?}"
518        );
519    }
520
521    #[test]
522    fn try_new_fails_non_monotonic2() {
523        // this test case checks whether offsets are monotonic in the full validation logic.
524
525        // 'cat', 'dog', 'lamb', "eel"
526        let bytes = &[
527            0b0000_0001, // header, offset_size_minus_one=0 and version=1
528            4,           // dictionary_size
529            0x00,
530            0x02,
531            0x01, // Doesn't increase monotonically
532            0x10,
533            13,
534            b'c',
535            b'a',
536            b't',
537            b'd',
538            b'o',
539            b'g',
540            b'l',
541            b'a',
542            b'm',
543            b'b',
544            b'e',
545            b'e',
546            b'l',
547        ];
548
549        let err = VariantMetadata::try_new(bytes).unwrap_err();
550
551        assert!(
552            matches!(err, ArrowError::InvalidArgumentError(_)),
553            "unexpected error: {err:?}"
554        );
555    }
556
557    #[test]
558    fn try_new_truncated_offsets_inline() {
559        // Missing final offset
560        let bytes = &[0b0000_0001, 0x02, 0x00, 0x01];
561
562        let err = VariantMetadata::try_new(bytes).unwrap_err();
563        assert!(
564            matches!(err, ArrowError::InvalidArgumentError(_)),
565            "unexpected error: {err:?}"
566        );
567    }
568
569    #[test]
570    fn empty_string_is_valid() {
571        let bytes = &[
572            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
573            1,
574            0x00,
575            0x00,
576        ];
577        let metadata = VariantMetadata::try_new(bytes).unwrap();
578        assert_eq!(&metadata[0], "");
579
580        let bytes = &[
581            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
582            2,
583            0x00,
584            0x00,
585            0x02,
586            b'h',
587            b'i',
588        ];
589        let metadata = VariantMetadata::try_new(bytes).unwrap();
590        assert_eq!(&metadata[0], "");
591        assert_eq!(&metadata[1], "hi");
592
593        let bytes = &[
594            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
595            2,
596            0x00,
597            0x02,
598            0x02, // empty string is allowed, but must be first in a sorted dict
599            b'h',
600            b'i',
601        ];
602        let err = VariantMetadata::try_new(bytes).unwrap_err();
603        assert!(
604            matches!(err, ArrowError::InvalidArgumentError(_)),
605            "unexpected error: {err:?}"
606        );
607    }
608
609    #[test]
610    fn test_compare_sorted_dictionary_with_unsorted_dictionary() {
611        // create a sorted object
612        let mut b = VariantBuilder::new();
613        let mut o = b.new_object();
614
615        o.insert("a", false);
616        o.insert("b", false);
617
618        o.finish();
619
620        let (m, _) = b.finish();
621
622        let m1 = VariantMetadata::new(&m);
623        assert!(m1.is_sorted());
624
625        // Create metadata with an unsorted dictionary (field names are "a", "a", "b")
626        // Since field names are not unique, it is considered not sorted.
627        let metadata_bytes = vec![
628            0b0000_0001,
629            3, // dictionary size
630            0, // "a"
631            1, // "a"
632            2, // "b"
633            3,
634            b'a',
635            b'a',
636            b'b',
637        ];
638        let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap();
639        assert!(!m2.is_sorted());
640
641        assert_ne!(m1, m2);
642    }
643
644    #[test]
645    fn test_compare_sorted_dictionary_with_sorted_dictionary() {
646        // create a sorted object
647        let mut b = VariantBuilder::new();
648        let mut o = b.new_object();
649
650        o.insert("a", false);
651        o.insert("b", false);
652
653        o.finish();
654
655        let (m, _) = b.finish();
656
657        let m1 = VariantMetadata::new(&m);
658        let m2 = VariantMetadata::new(&m);
659
660        assert_eq!(m1, m2);
661    }
662}