parquet_variant/variant/
metadata.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
19use crate::utils::{
20    first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
21    try_binary_search_range_by,
22};
23
24use arrow_schema::ArrowError;
25
26/// Header structure for [`VariantMetadata`]
27#[derive(Debug, Clone, Copy, PartialEq)]
28pub(crate) struct VariantMetadataHeader {
29    version: u8,
30    is_sorted: bool,
31    /// Note: This is `offset_size_minus_one` + 1
32    offset_size: OffsetSizeBytes,
33}
34
35// According to the spec this is currently always = 1, and so we store this const for validation
36// purposes and to make that visible.
37const CORRECT_VERSION_VALUE: u8 = 1;
38
39// The metadata header occupies one byte; use a named constant for readability
40const NUM_HEADER_BYTES: u32 = 1;
41
42impl VariantMetadataHeader {
43    // Hide the cast
44    const fn offset_size(&self) -> u32 {
45        self.offset_size as u32
46    }
47
48    // Avoid materializing this offset, since it's cheaply and safely computable
49    const fn first_offset_byte(&self) -> u32 {
50        NUM_HEADER_BYTES + self.offset_size()
51    }
52
53    /// Tries to construct the variant metadata header, which has the form
54    ///
55    /// ```text
56    ///              7     6  5   4  3             0
57    ///             +-------+---+---+---------------+
58    /// header      |       |   |   |    version    |
59    ///             +-------+---+---+---------------+
60    ///                 ^         ^
61    ///                 |         +-- sorted_strings
62    ///                 +-- offset_size_minus_one
63    /// ```
64    ///
65    /// The version is a 4-bit value that must always contain the value 1.
66    /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
67    /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
68    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
69    pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
70        let version = header_byte & 0x0F; // First four bits
71        if version != CORRECT_VERSION_VALUE {
72            let err_msg = format!(
73                "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {version:b}",
74            );
75            return Err(ArrowError::InvalidArgumentError(err_msg));
76        }
77        let is_sorted = (header_byte & 0x10) != 0; // Fifth bit
78        let offset_size_minus_one = header_byte >> 6; // Last two bits
79        Ok(Self {
80            version,
81            is_sorted,
82            offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
83        })
84    }
85}
86
87/// [`Variant`] Metadata
88///
89/// See the [Variant Spec] file for more information
90///
91/// # Validation
92///
93/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
94/// underlying bytes are a valid encoding of variant metadata (see below).
95///
96/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
97/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
98/// validation cost is linear in the number of underlying bytes.
99///
100/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
101/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
102/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
103/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
104/// _unvalidated_ instance, if desired.
105///
106/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
107/// knows the underlying bytes were already validated previously, or if the caller intends to
108/// perform a small number of (fallible) accesses to a large dictionary.
109///
110/// A _validated_ variant [metadata instance guarantees that:
111///
112/// - header byte is valid
113/// - dictionary size is in bounds
114/// - offset array content is in-bounds
115/// - first offset is zero
116/// - last offset is in-bounds
117/// - all other offsets are in-bounds (*)
118/// - all offsets are monotonically increasing (*)
119/// - all values are valid utf-8 (*)
120///
121/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
122/// in the list above); it panics any of the other checks fails.
123///
124/// # Safety
125///
126/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing
127/// it with infallible methods may cause panics but will never lead to undefined behavior.
128///
129/// [`Variant`]: crate::Variant
130/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
131#[derive(Debug, Clone, PartialEq)]
132pub struct VariantMetadata<'m> {
133    pub(crate) bytes: &'m [u8],
134    header: VariantMetadataHeader,
135    dictionary_size: u32,
136    first_value_byte: u32,
137    validated: bool,
138}
139
140// We don't want this to grow because it increases the size of VariantList and VariantObject, which
141// could increase the size of Variant. All those size increases could hurt performance.
142const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
143
144impl<'m> VariantMetadata<'m> {
145    /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
146    /// dictionary entries.
147    ///
148    /// [validation]: Self#Validation
149    pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
150        Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
151    }
152
153    /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
154    /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as
155    /// indexing and iteration could also panic if the underlying bytes are invalid.
156    ///
157    /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes
158    /// were already validated previously by other means, or if the caller expects a small number of
159    /// accesses to a large dictionary (preferring to use a small number of fallible accesses as
160    /// needed, instead of paying expensive full validation up front).
161    ///
162    /// [validate]: Self#Validation
163    pub fn new(bytes: &'m [u8]) -> Self {
164        Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
165    }
166
167    // The actual constructor, which performs only basic (constant-const) validation.
168    pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
169        let header_byte = first_byte_from_slice(bytes)?;
170        let header = VariantMetadataHeader::try_new(header_byte)?;
171
172        // First element after header is dictionary size; the offset array immediately follows.
173        let dictionary_size =
174            header
175                .offset_size
176                .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
177
178        // Calculate the starting offset of the dictionary string bytes.
179        //
180        // There are dict_size + 1 offsets, and the value bytes immediately follow
181        // = (dict_size + 1) * offset_size + header.first_offset_byte()
182        let first_value_byte = dictionary_size
183            .checked_add(1)
184            .and_then(|n| n.checked_mul(header.offset_size()))
185            .and_then(|n| n.checked_add(header.first_offset_byte()))
186            .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
187
188        let mut new_self = Self {
189            bytes,
190            header,
191            dictionary_size,
192            first_value_byte,
193            validated: false,
194        };
195
196        // Validate just the first and last offset, ignoring the other offsets and all value bytes.
197        let first_offset = new_self.get_offset(0)?;
198        if first_offset != 0 {
199            return Err(ArrowError::InvalidArgumentError(format!(
200                "First offset is not zero: {first_offset}"
201            )));
202        }
203
204        // Use the last offset to upper-bound the byte slice
205        let last_offset = new_self
206            .get_offset(dictionary_size as _)?
207            .checked_add(first_value_byte)
208            .ok_or_else(|| overflow_error("variant metadata size"))?;
209        new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
210        Ok(new_self)
211    }
212
213    /// The number of metadata dictionary entries
214    pub fn len(&self) -> usize {
215        self.dictionary_size as _
216    }
217
218    /// True if this metadata dictionary contains no entries
219    pub fn is_empty(&self) -> bool {
220        self.len() == 0
221    }
222
223    /// True if this instance is fully [validated] for panic-free infallible accesses.
224    ///
225    /// [validated]: Self#Validation
226    pub fn is_fully_validated(&self) -> bool {
227        self.validated
228    }
229
230    /// Performs a full [validation] of this metadata dictionary and returns the result.
231    ///
232    /// [validation]: Self#Validation
233    pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
234        if !self.validated {
235            let offset_bytes = slice_from_slice(
236                self.bytes,
237                self.header.first_offset_byte() as _..self.first_value_byte as _,
238            )?;
239
240            // Verify the string values in the dictionary are UTF-8 encoded strings.
241            let value_buffer =
242                string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
243
244            let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
245
246            if self.header.is_sorted {
247                // Validate the dictionary values are unique and lexicographically sorted
248                //
249                // Since we use the offsets to access dictionary values, this also validates
250                // offsets are in-bounds and monotonically increasing
251                let mut current_offset = offsets.next().unwrap_or(0);
252                let mut prev_value: Option<&str> = None;
253                for next_offset in offsets {
254                    let current_value =
255                        value_buffer
256                            .get(current_offset..next_offset)
257                            .ok_or_else(|| {
258                                ArrowError::InvalidArgumentError(format!(
259                                "range {current_offset}..{next_offset} is invalid or out of bounds"
260                            ))
261                            })?;
262
263                    if let Some(prev_val) = prev_value {
264                        if current_value <= prev_val {
265                            return Err(ArrowError::InvalidArgumentError(
266                                "dictionary values are not unique and ordered".to_string(),
267                            ));
268                        }
269                    }
270
271                    prev_value = Some(current_value);
272                    current_offset = next_offset;
273                }
274            } else {
275                // Validate offsets are in-bounds and monotonically increasing
276                //
277                // Since shallow validation ensures the first and last offsets are in bounds,
278                // we can also verify all offsets are in-bounds by checking if
279                // offsets are monotonically increasing
280                if !offsets.is_sorted_by(|a, b| a < b) {
281                    return Err(ArrowError::InvalidArgumentError(
282                        "offsets not monotonically increasing".to_string(),
283                    ));
284                }
285            }
286
287            self.validated = true;
288        }
289        Ok(self)
290    }
291
292    /// Whether the dictionary keys are sorted and unique
293    pub fn is_sorted(&self) -> bool {
294        self.header.is_sorted
295    }
296
297    /// The variant protocol version
298    pub const fn version(&self) -> u8 {
299        self.header.version
300    }
301
302    /// Gets an offset array entry by index.
303    ///
304    /// This offset is an index into the dictionary, at the boundary between string `i-1` and string
305    /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
306    fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
307        let offset_byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
308        let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
309        self.header.offset_size.unpack_u32(bytes, i)
310    }
311
312    /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
313    /// underlying bytes are [invalid].
314    ///
315    /// [invalid]: Self#Validation
316    pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
317        let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as _;
318        string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
319    }
320
321    // Helper method used by our `impl Index` and also by `get_entry`. Panics if the underlying
322    // bytes are invalid. Needed because the `Index` trait forces the returned result to have the
323    // lifetime of `self` instead of the string's own (longer) lifetime `'m`.
324    fn get_impl(&self, i: usize) -> &'m str {
325        self.get(i).expect("Invalid metadata dictionary entry")
326    }
327
328    /// Attempts to retrieve a dictionary entry and its field id, returning None if the requested field
329    /// name is not present. The search cost is logarithmic if [`Self::is_sorted`] and linear
330    /// otherwise.
331    ///
332    /// WARNING: This method panics if the underlying bytes are [invalid].
333    ///
334    /// [invalid]: Self#Validation
335    pub fn get_entry(&self, field_name: &str) -> Option<(u32, &'m str)> {
336        let field_id = if self.is_sorted() && self.len() > 10 {
337            // Binary search is faster for a not-tiny sorted metadata dictionary
338            let cmp = |i| Some(self.get_impl(i).cmp(field_name));
339            try_binary_search_range_by(0..self.len(), cmp)?.ok()?
340        } else {
341            // Fall back to Linear search for tiny or unsorted dictionary
342            (0..self.len()).find(|i| self.get_impl(*i) == field_name)?
343        };
344        Some((field_id as u32, self.get_impl(field_id)))
345    }
346
347    /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the
348    /// iterator encounters [invalid] data.
349    ///
350    /// [invalid]: Self#Validation
351    pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
352        (0..self.len()).map(|i| self.get(i))
353    }
354
355    /// Iterates over all dictionary entries. When working with [unvalidated] input, consider
356    /// [`Self::iter_try`] to avoid panics due to invalid data.
357    ///
358    /// [unvalidated]: Self#Validation
359    pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
360        self.iter_try()
361            .map(|result| result.expect("Invalid metadata dictionary entry"))
362    }
363}
364
365/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
366/// [unvalidated] input could also panic if the underlying bytes are invalid.
367///
368/// [unvalidated]: Self#Validation
369impl std::ops::Index<usize> for VariantMetadata<'_> {
370    type Output = str;
371
372    fn index(&self, i: usize) -> &str {
373        self.get_impl(i)
374    }
375}
376
377#[cfg(test)]
378mod tests {
379
380    use crate::VariantBuilder;
381
382    use super::*;
383
384    /// `"cat"`, `"dog"` – valid metadata
385    #[test]
386    fn try_new_ok_inline() {
387        let bytes = &[
388            0b0000_0001, // header, offset_size_minus_one=0 and version=1
389            0x02,        // dictionary_size (2 strings)
390            0x00,
391            0x03,
392            0x06,
393            b'c',
394            b'a',
395            b't',
396            b'd',
397            b'o',
398            b'g',
399        ];
400
401        let md = VariantMetadata::try_new(bytes).expect("should parse");
402        assert_eq!(md.len(), 2);
403        // Fields
404        assert_eq!(&md[0], "cat");
405        assert_eq!(&md[1], "dog");
406
407        // Offsets
408        assert_eq!(md.get_offset(0).unwrap(), 0x00);
409        assert_eq!(md.get_offset(1).unwrap(), 0x03);
410        assert_eq!(md.get_offset(2).unwrap(), 0x06);
411
412        let err = md.get_offset(3).unwrap_err();
413        assert!(
414            matches!(err, ArrowError::InvalidArgumentError(_)),
415            "unexpected error: {err:?}"
416        );
417
418        let fields: Vec<(usize, &str)> = md.iter().enumerate().collect();
419        assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]);
420    }
421
422    /// Too short buffer test (missing one required offset).
423    /// Should error with "metadata shorter than dictionary_size implies".
424    #[test]
425    fn try_new_missing_last_value() {
426        let bytes = &[
427            0b0000_0001, // header, offset_size_minus_one=0 and version=1
428            0x02,        // dictionary_size = 2
429            0x00,
430            0x01,
431            0x02,
432            b'a',
433            b'b', // <-- we'll remove this
434        ];
435
436        let working_md = VariantMetadata::try_new(bytes).expect("should parse");
437        assert_eq!(working_md.len(), 2);
438        assert_eq!(&working_md[0], "a");
439        assert_eq!(&working_md[1], "b");
440
441        let truncated = &bytes[..bytes.len() - 1];
442
443        let err = VariantMetadata::try_new(truncated).unwrap_err();
444        assert!(
445            matches!(err, ArrowError::InvalidArgumentError(_)),
446            "unexpected error: {err:?}"
447        );
448    }
449
450    #[test]
451    fn try_new_fails_non_monotonic() {
452        // 'cat', 'dog', 'lamb'
453        let bytes = &[
454            0b0000_0001, // header, offset_size_minus_one=0 and version=1
455            0x03,        // dictionary_size
456            0x00,
457            0x02,
458            0x01, // Doesn't increase monotonically
459            0x10,
460            b'c',
461            b'a',
462            b't',
463            b'd',
464            b'o',
465            b'g',
466            b'l',
467            b'a',
468            b'm',
469            b'b',
470        ];
471
472        let err = VariantMetadata::try_new(bytes).unwrap_err();
473        assert!(
474            matches!(err, ArrowError::InvalidArgumentError(_)),
475            "unexpected error: {err:?}"
476        );
477    }
478
479    #[test]
480    fn try_new_fails_non_monotonic2() {
481        // this test case checks whether offsets are monotonic in the full validation logic.
482
483        // 'cat', 'dog', 'lamb', "eel"
484        let bytes = &[
485            0b0000_0001, // header, offset_size_minus_one=0 and version=1
486            4,           // dictionary_size
487            0x00,
488            0x02,
489            0x01, // Doesn't increase monotonically
490            0x10,
491            13,
492            b'c',
493            b'a',
494            b't',
495            b'd',
496            b'o',
497            b'g',
498            b'l',
499            b'a',
500            b'm',
501            b'b',
502            b'e',
503            b'e',
504            b'l',
505        ];
506
507        let err = VariantMetadata::try_new(bytes).unwrap_err();
508
509        assert!(
510            matches!(err, ArrowError::InvalidArgumentError(_)),
511            "unexpected error: {err:?}"
512        );
513    }
514
515    #[test]
516    fn try_new_truncated_offsets_inline() {
517        // Missing final offset
518        let bytes = &[0b0000_0001, 0x02, 0x00, 0x01];
519
520        let err = VariantMetadata::try_new(bytes).unwrap_err();
521        assert!(
522            matches!(err, ArrowError::InvalidArgumentError(_)),
523            "unexpected error: {err:?}"
524        );
525    }
526
527    #[test]
528    fn empty_string_is_valid() {
529        let bytes = &[
530            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
531            1,
532            0x00,
533            0x00,
534        ];
535        let metadata = VariantMetadata::try_new(bytes).unwrap();
536        assert_eq!(&metadata[0], "");
537
538        let bytes = &[
539            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
540            2,
541            0x00,
542            0x00,
543            0x02,
544            b'h',
545            b'i',
546        ];
547        let metadata = VariantMetadata::try_new(bytes).unwrap();
548        assert_eq!(&metadata[0], "");
549        assert_eq!(&metadata[1], "hi");
550
551        let bytes = &[
552            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
553            2,
554            0x00,
555            0x02,
556            0x02, // empty string is allowed, but must be first in a sorted dict
557            b'h',
558            b'i',
559        ];
560        let err = VariantMetadata::try_new(bytes).unwrap_err();
561        assert!(
562            matches!(err, ArrowError::InvalidArgumentError(_)),
563            "unexpected error: {err:?}"
564        );
565    }
566
567    #[test]
568    fn test_compare_sorted_dictionary_with_unsorted_dictionary() {
569        // create a sorted object
570        let mut b = VariantBuilder::new();
571        let mut o = b.new_object();
572
573        o.insert("a", false);
574        o.insert("b", false);
575
576        o.finish();
577
578        let (m, _) = b.finish();
579
580        let m1 = VariantMetadata::new(&m);
581        assert!(m1.is_sorted());
582
583        // Create metadata with an unsorted dictionary (field names are "a", "a", "b")
584        // Since field names are not unique, it is considered not sorted.
585        let metadata_bytes = vec![
586            0b0000_0001,
587            3, // dictionary size
588            0, // "a"
589            1, // "a"
590            2, // "b"
591            3,
592            b'a',
593            b'a',
594            b'b',
595        ];
596        let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap();
597        assert!(!m2.is_sorted());
598
599        assert_ne!(m1, m2);
600    }
601
602    #[test]
603    fn test_compare_sorted_dictionary_with_sorted_dictionary() {
604        // create a sorted object
605        let mut b = VariantBuilder::new();
606        let mut o = b.new_object();
607
608        o.insert("a", false);
609        o.insert("b", false);
610
611        o.finish();
612
613        let (m, _) = b.finish();
614
615        let m1 = VariantMetadata::new(&m);
616        let m2 = VariantMetadata::new(&m);
617
618        assert_eq!(m1, m2);
619    }
620}