Skip to main content

parquet_variant/variant/
metadata.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{OffsetSizeBytes, map_bytes_to_offsets};
19use crate::utils::{
20    first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
21    try_binary_search_range_by,
22};
23
24use arrow_schema::ArrowError;
25
26/// Header structure for [`VariantMetadata`]
27#[derive(Debug, Clone, Copy, PartialEq)]
28pub(crate) struct VariantMetadataHeader {
29    version: u8,
30    is_sorted: bool,
31    /// Note: This is `offset_size_minus_one` + 1
32    offset_size: OffsetSizeBytes,
33}
34
35// According to the spec this is currently always = 1, and so we store this const for validation
36// purposes and to make that visible.
37const CORRECT_VERSION_VALUE: u8 = 1;
38
39// The metadata header occupies one byte; use a named constant for readability
40const NUM_HEADER_BYTES: u32 = 1;
41
42impl VariantMetadataHeader {
43    // Hide the cast
44    const fn offset_size(&self) -> u32 {
45        self.offset_size as u32
46    }
47
48    // Avoid materializing this offset, since it's cheaply and safely computable
49    const fn first_offset_byte(&self) -> u32 {
50        NUM_HEADER_BYTES + self.offset_size()
51    }
52
53    /// Tries to construct the variant metadata header, which has the form
54    ///
55    /// ```text
56    ///              7     6  5   4  3             0
57    ///             +-------+---+---+---------------+
58    /// header      |       |   |   |    version    |
59    ///             +-------+---+---+---------------+
60    ///                 ^         ^
61    ///                 |         +-- sorted_strings
62    ///                 +-- offset_size_minus_one
63    /// ```
64    ///
65    /// The version is a 4-bit value that must always contain the value 1.
66    /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
67    /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
68    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
69    pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
70        let version = header_byte & 0x0F; // First four bits
71        if version != CORRECT_VERSION_VALUE {
72            let err_msg = format!(
73                "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {version:b}",
74            );
75            return Err(ArrowError::InvalidArgumentError(err_msg));
76        }
77        let is_sorted = (header_byte & 0x10) != 0; // Fifth bit
78        let offset_size_minus_one = header_byte >> 6; // Last two bits
79        Ok(Self {
80            version,
81            is_sorted,
82            offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
83        })
84    }
85}
86
87/// [`Variant`] Metadata
88///
89/// See the [Variant Spec] file for more information
90///
91/// # Validation
92///
93/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
94/// underlying bytes are a valid encoding of variant metadata (see below).
95///
96/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
97/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
98/// validation cost is linear in the number of underlying bytes.
99///
100/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
101/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
102/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
103/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
104/// _unvalidated_ instance, if desired.
105///
106/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
107/// knows the underlying bytes were already validated previously, or if the caller intends to
108/// perform a small number of (fallible) accesses to a large dictionary.
109///
110/// A _validated_ variant [metadata instance guarantees that:
111///
112/// - header byte is valid
113/// - dictionary size is in bounds
114/// - offset array content is in-bounds
115/// - first offset is zero
116/// - last offset is in-bounds
117/// - all other offsets are in-bounds (*)
118/// - all offsets are monotonically increasing (*)
119/// - all values are valid utf-8 (*)
120///
121/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
122/// in the list above); it panics any of the other checks fails.
123///
124/// # Safety
125///
126/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing
127/// it with infallible methods may cause panics but will never lead to undefined behavior.
128///
129/// [`Variant`]: crate::Variant
130/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
131#[derive(Debug, Clone, PartialEq)]
132pub struct VariantMetadata<'m> {
133    /// (Only) the bytes that make up this metadata instance.
134    pub(crate) bytes: &'m [u8],
135    header: VariantMetadataHeader,
136    dictionary_size: u32,
137    first_value_byte: u32,
138    validated: bool,
139}
140
141// We don't want this to grow because it increases the size of VariantList and VariantObject, which
142// could increase the size of Variant. All those size increases could hurt performance.
143#[cfg(target_pointer_width = "64")]
144const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
145
146#[cfg(target_pointer_width = "32")]
147const _: () = crate::utils::expect_size_of::<VariantMetadata>(20);
148
149/// The canonical byte slice corresponding to an empty metadata dictionary.
150///
151/// ```
152/// # use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, VariantMetadata, WritableMetadataBuilder};
153/// let mut metadata_builder = WritableMetadataBuilder::default();
154/// metadata_builder.finish();
155/// let metadata_bytes = metadata_builder.into_inner();
156/// assert_eq!(&metadata_bytes, EMPTY_VARIANT_METADATA_BYTES);
157/// ```
158pub const EMPTY_VARIANT_METADATA_BYTES: &[u8] = &[1, 0, 0];
159
160/// The empty metadata dictionary.
161///
162/// ```
163/// # use parquet_variant::{EMPTY_VARIANT_METADATA, VariantMetadata, WritableMetadataBuilder};
164/// let mut metadata_builder = WritableMetadataBuilder::default();
165/// metadata_builder.finish();
166/// let metadata_bytes = metadata_builder.into_inner();
167/// let empty_metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
168/// assert_eq!(empty_metadata, EMPTY_VARIANT_METADATA);
169/// ```
170pub const EMPTY_VARIANT_METADATA: VariantMetadata = VariantMetadata {
171    bytes: EMPTY_VARIANT_METADATA_BYTES,
172    header: VariantMetadataHeader {
173        version: CORRECT_VERSION_VALUE,
174        is_sorted: false,
175        offset_size: OffsetSizeBytes::One,
176    },
177    dictionary_size: 0,
178    first_value_byte: 3,
179    validated: true,
180};
181
182impl<'m> VariantMetadata<'m> {
183    /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
184    /// dictionary entries.
185    ///
186    /// [validation]: Self#Validation
187    pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
188        Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
189    }
190
191    /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
192    /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as
193    /// indexing and iteration could also panic if the underlying bytes are invalid.
194    ///
195    /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes
196    /// were already validated previously by other means, or if the caller expects a small number of
197    /// accesses to a large dictionary (preferring to use a small number of fallible accesses as
198    /// needed, instead of paying expensive full validation up front).
199    ///
200    /// [validate]: Self#Validation
201    pub fn new(bytes: &'m [u8]) -> Self {
202        Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
203    }
204
205    // The actual constructor, which performs only basic (constant-const) validation.
206    pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
207        let header_byte = first_byte_from_slice(bytes)?;
208        let header = VariantMetadataHeader::try_new(header_byte)?;
209
210        // First element after header is dictionary size; the offset array immediately follows.
211        let dictionary_size =
212            header
213                .offset_size
214                .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
215
216        // Calculate the starting offset of the dictionary string bytes.
217        //
218        // There are dict_size + 1 offsets, and the value bytes immediately follow
219        // = (dict_size + 1) * offset_size + header.first_offset_byte()
220        let first_value_byte = dictionary_size
221            .checked_add(1)
222            .and_then(|n| n.checked_mul(header.offset_size()))
223            .and_then(|n| n.checked_add(header.first_offset_byte()))
224            .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
225
226        let mut new_self = Self {
227            bytes,
228            header,
229            dictionary_size,
230            first_value_byte,
231            validated: false,
232        };
233
234        // Validate just the first and last offset, ignoring the other offsets and all value bytes.
235        let first_offset = new_self.get_offset(0)?;
236        if first_offset != 0 {
237            return Err(ArrowError::InvalidArgumentError(format!(
238                "First offset is not zero: {first_offset}"
239            )));
240        }
241
242        // Use the last offset to upper-bound the byte slice
243        let last_offset = new_self
244            .get_offset(dictionary_size as _)?
245            .checked_add(first_value_byte)
246            .ok_or_else(|| overflow_error("variant metadata size"))?;
247        new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
248        Ok(new_self)
249    }
250
251    /// The number of metadata dictionary entries
252    pub fn len(&self) -> usize {
253        self.dictionary_size as _
254    }
255
256    /// True if this metadata dictionary contains no entries
257    pub fn is_empty(&self) -> bool {
258        self.len() == 0
259    }
260
261    /// True if this instance is fully [validated] for panic-free infallible accesses.
262    ///
263    /// [validated]: Self#Validation
264    pub fn is_fully_validated(&self) -> bool {
265        self.validated
266    }
267
268    /// Performs a full [validation] of this metadata dictionary and returns the result.
269    ///
270    /// [validation]: Self#Validation
271    pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
272        if !self.validated {
273            let offset_bytes = slice_from_slice(
274                self.bytes,
275                self.header.first_offset_byte() as _..self.first_value_byte as _,
276            )?;
277
278            // Verify the string values in the dictionary are UTF-8 encoded strings.
279            let value_buffer =
280                string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
281
282            let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
283
284            if self.header.is_sorted {
285                // Validate the dictionary values are unique and lexicographically sorted
286                //
287                // Since we use the offsets to access dictionary values, this also validates
288                // offsets are in-bounds and monotonically increasing
289                let mut current_offset = offsets.next().unwrap_or(0);
290                let mut prev_value: Option<&str> = None;
291                for next_offset in offsets {
292                    let current_value = value_buffer.get(current_offset..next_offset).ok_or_else(
293                        || {
294                            ArrowError::InvalidArgumentError(format!(
295                                "range {current_offset}..{next_offset} is invalid or out of bounds"
296                            ))
297                        },
298                    )?;
299
300                    if let Some(prev_val) = prev_value {
301                        if current_value <= prev_val {
302                            return Err(ArrowError::InvalidArgumentError(
303                                "dictionary values are not unique and ordered".to_string(),
304                            ));
305                        }
306                    }
307
308                    prev_value = Some(current_value);
309                    current_offset = next_offset;
310                }
311            } else {
312                // Validate offsets are in-bounds and monotonically increasing
313                //
314                // Since shallow validation ensures the first and last offsets are in bounds,
315                // we can also verify all offsets are in-bounds by checking if
316                // offsets are monotonically increasing
317                if !offsets.is_sorted_by(|a, b| a < b) {
318                    return Err(ArrowError::InvalidArgumentError(
319                        "offsets not monotonically increasing".to_string(),
320                    ));
321                }
322            }
323
324            self.validated = true;
325        }
326        Ok(self)
327    }
328
329    /// Whether the dictionary keys are sorted and unique
330    pub fn is_sorted(&self) -> bool {
331        self.header.is_sorted
332    }
333
334    /// The variant protocol version
335    pub const fn version(&self) -> u8 {
336        self.header.version
337    }
338
339    /// Gets an offset into the dictionary entry by index.
340    ///
341    /// This offset is an index into the dictionary, at the boundary between string `i-1` and string
342    /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
343    fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
344        let offset_byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
345        let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
346        self.header.offset_size.unpack_u32(bytes, i)
347    }
348
349    /// Returns the total size, in bytes, of the metadata.
350    ///
351    /// Note this value may be smaller than what was passed to [`Self::new`] or
352    /// [`Self::try_new`] if the input was larger than necessary to encode the
353    /// metadata dictionary.
354    pub fn size(&self) -> usize {
355        self.bytes.len()
356    }
357
358    /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
359    /// underlying bytes are [invalid].
360    ///
361    /// [invalid]: Self#Validation
362    pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
363        let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as _;
364        string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
365    }
366
367    // Helper method used by our `impl Index` and also by `get_entry`. Panics if the underlying
368    // bytes are invalid. Needed because the `Index` trait forces the returned result to have the
369    // lifetime of `self` instead of the string's own (longer) lifetime `'m`.
370    fn get_impl(&self, i: usize) -> &'m str {
371        self.get(i).expect("Invalid metadata dictionary entry")
372    }
373
374    /// Attempts to retrieve a dictionary entry and its field id, returning None if the requested field
375    /// name is not present. The search cost is logarithmic if [`Self::is_sorted`] and linear
376    /// otherwise.
377    ///
378    /// WARNING: This method panics if the underlying bytes are [invalid].
379    ///
380    /// [invalid]: Self#Validation
381    pub fn get_entry(&self, field_name: &str) -> Option<(u32, &'m str)> {
382        let field_id = if self.is_sorted() && self.len() > 10 {
383            // Binary search is faster for a not-tiny sorted metadata dictionary
384            let cmp = |i| Some(self.get_impl(i).cmp(field_name));
385            try_binary_search_range_by(0..self.len(), cmp)?.ok()?
386        } else {
387            // Fall back to Linear search for tiny or unsorted dictionary
388            (0..self.len()).find(|i| self.get_impl(*i) == field_name)?
389        };
390        Some((field_id as u32, self.get_impl(field_id)))
391    }
392
393    /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the
394    /// iterator encounters [invalid] data.
395    ///
396    /// [invalid]: Self#Validation
397    pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
398        (0..self.len()).map(|i| self.get(i))
399    }
400
401    /// Iterates over all dictionary entries. When working with [unvalidated] input, consider
402    /// [`Self::iter_try`] to avoid panics due to invalid data.
403    ///
404    /// [unvalidated]: Self#Validation
405    pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
406        self.iter_try()
407            .map(|result| result.expect("Invalid metadata dictionary entry"))
408    }
409}
410
411/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
412/// [unvalidated] input could also panic if the underlying bytes are invalid.
413///
414/// [unvalidated]: Self#Validation
415impl std::ops::Index<usize> for VariantMetadata<'_> {
416    type Output = str;
417
418    fn index(&self, i: usize) -> &str {
419        self.get_impl(i)
420    }
421}
422
423#[cfg(test)]
424mod tests {
425
426    use crate::VariantBuilder;
427
428    use super::*;
429
430    /// `"cat"`, `"dog"` – valid metadata
431    #[test]
432    fn try_new_ok_inline() {
433        let bytes = &[
434            0b0000_0001, // header, offset_size_minus_one=0 and version=1
435            0x02,        // dictionary_size (2 strings)
436            0x00,
437            0x03,
438            0x06,
439            b'c',
440            b'a',
441            b't',
442            b'd',
443            b'o',
444            b'g',
445        ];
446
447        let md = VariantMetadata::try_new(bytes).expect("should parse");
448        assert_eq!(md.len(), 2);
449        // Fields
450        assert_eq!(&md[0], "cat");
451        assert_eq!(&md[1], "dog");
452
453        // Offsets
454        assert_eq!(md.get_offset(0).unwrap(), 0x00);
455        assert_eq!(md.get_offset(1).unwrap(), 0x03);
456        assert_eq!(md.get_offset(2).unwrap(), 0x06);
457
458        let err = md.get_offset(3).unwrap_err();
459        assert!(
460            matches!(err, ArrowError::InvalidArgumentError(_)),
461            "unexpected error: {err:?}"
462        );
463
464        let fields: Vec<(usize, &str)> = md.iter().enumerate().collect();
465        assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]);
466    }
467
468    /// Too short buffer test (missing one required offset).
469    /// Should error with "metadata shorter than dictionary_size implies".
470    #[test]
471    fn try_new_missing_last_value() {
472        let bytes = &[
473            0b0000_0001, // header, offset_size_minus_one=0 and version=1
474            0x02,        // dictionary_size = 2
475            0x00,
476            0x01,
477            0x02,
478            b'a',
479            b'b', // <-- we'll remove this
480        ];
481
482        let working_md = VariantMetadata::try_new(bytes).expect("should parse");
483        assert_eq!(working_md.len(), 2);
484        assert_eq!(&working_md[0], "a");
485        assert_eq!(&working_md[1], "b");
486
487        let truncated = &bytes[..bytes.len() - 1];
488
489        let err = VariantMetadata::try_new(truncated).unwrap_err();
490        assert!(
491            matches!(err, ArrowError::InvalidArgumentError(_)),
492            "unexpected error: {err:?}"
493        );
494    }
495
496    #[test]
497    fn try_new_fails_non_monotonic() {
498        // 'cat', 'dog', 'lamb'
499        let bytes = &[
500            0b0000_0001, // header, offset_size_minus_one=0 and version=1
501            0x03,        // dictionary_size
502            0x00,
503            0x02,
504            0x01, // Doesn't increase monotonically
505            0x10,
506            b'c',
507            b'a',
508            b't',
509            b'd',
510            b'o',
511            b'g',
512            b'l',
513            b'a',
514            b'm',
515            b'b',
516        ];
517
518        let err = VariantMetadata::try_new(bytes).unwrap_err();
519        assert!(
520            matches!(err, ArrowError::InvalidArgumentError(_)),
521            "unexpected error: {err:?}"
522        );
523    }
524
525    #[test]
526    fn try_new_fails_non_monotonic2() {
527        // this test case checks whether offsets are monotonic in the full validation logic.
528
529        // 'cat', 'dog', 'lamb', "eel"
530        let bytes = &[
531            0b0000_0001, // header, offset_size_minus_one=0 and version=1
532            4,           // dictionary_size
533            0x00,
534            0x02,
535            0x01, // Doesn't increase monotonically
536            0x10,
537            13,
538            b'c',
539            b'a',
540            b't',
541            b'd',
542            b'o',
543            b'g',
544            b'l',
545            b'a',
546            b'm',
547            b'b',
548            b'e',
549            b'e',
550            b'l',
551        ];
552
553        let err = VariantMetadata::try_new(bytes).unwrap_err();
554
555        assert!(
556            matches!(err, ArrowError::InvalidArgumentError(_)),
557            "unexpected error: {err:?}"
558        );
559    }
560
561    #[test]
562    fn try_new_truncated_offsets_inline() {
563        // Missing final offset
564        let bytes = &[0b0000_0001, 0x02, 0x00, 0x01];
565
566        let err = VariantMetadata::try_new(bytes).unwrap_err();
567        assert!(
568            matches!(err, ArrowError::InvalidArgumentError(_)),
569            "unexpected error: {err:?}"
570        );
571    }
572
573    #[test]
574    fn empty_string_is_valid() {
575        let bytes = &[
576            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
577            1,
578            0x00,
579            0x00,
580        ];
581        let metadata = VariantMetadata::try_new(bytes).unwrap();
582        assert_eq!(&metadata[0], "");
583
584        let bytes = &[
585            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
586            2,
587            0x00,
588            0x00,
589            0x02,
590            b'h',
591            b'i',
592        ];
593        let metadata = VariantMetadata::try_new(bytes).unwrap();
594        assert_eq!(&metadata[0], "");
595        assert_eq!(&metadata[1], "hi");
596
597        let bytes = &[
598            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
599            2,
600            0x00,
601            0x02,
602            0x02, // empty string is allowed, but must be first in a sorted dict
603            b'h',
604            b'i',
605        ];
606        let err = VariantMetadata::try_new(bytes).unwrap_err();
607        assert!(
608            matches!(err, ArrowError::InvalidArgumentError(_)),
609            "unexpected error: {err:?}"
610        );
611    }
612
613    #[test]
614    fn test_compare_sorted_dictionary_with_unsorted_dictionary() {
615        // create a sorted object
616        let mut b = VariantBuilder::new();
617        let mut o = b.new_object();
618
619        o.insert("a", false);
620        o.insert("b", false);
621
622        o.finish();
623
624        let (m, _) = b.finish();
625
626        let m1 = VariantMetadata::new(&m);
627        assert!(m1.is_sorted());
628
629        // Create metadata with an unsorted dictionary (field names are "a", "a", "b")
630        // Since field names are not unique, it is considered not sorted.
631        let metadata_bytes = vec![
632            0b0000_0001,
633            3, // dictionary size
634            0, // "a"
635            1, // "a"
636            2, // "b"
637            3,
638            b'a',
639            b'a',
640            b'b',
641        ];
642        let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap();
643        assert!(!m2.is_sorted());
644
645        assert_ne!(m1, m2);
646    }
647
648    #[test]
649    fn test_compare_sorted_dictionary_with_sorted_dictionary() {
650        // create a sorted object
651        let mut b = VariantBuilder::new();
652        let mut o = b.new_object();
653
654        o.insert("a", false);
655        o.insert("b", false);
656
657        o.finish();
658
659        let (m, _) = b.finish();
660
661        let m1 = VariantMetadata::new(&m);
662        let m2 = VariantMetadata::new(&m);
663
664        assert_eq!(m1, m2);
665    }
666}