parquet_variant/variant/
metadata.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
19use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice};
20
21use arrow_schema::ArrowError;
22
23/// Header structure for [`VariantMetadata`]
24#[derive(Debug, Clone, Copy, PartialEq)]
25pub(crate) struct VariantMetadataHeader {
26    version: u8,
27    is_sorted: bool,
28    /// Note: This is `offset_size_minus_one` + 1
29    offset_size: OffsetSizeBytes,
30}
31
32// According to the spec this is currently always = 1, and so we store this const for validation
33// purposes and to make that visible.
34const CORRECT_VERSION_VALUE: u8 = 1;
35
36// The metadata header occupies one byte; use a named constant for readability
37const NUM_HEADER_BYTES: u32 = 1;
38
39impl VariantMetadataHeader {
40    // Hide the cast
41    const fn offset_size(&self) -> u32 {
42        self.offset_size as u32
43    }
44
45    // Avoid materializing this offset, since it's cheaply and safely computable
46    const fn first_offset_byte(&self) -> u32 {
47        NUM_HEADER_BYTES + self.offset_size()
48    }
49
50    /// Tries to construct the variant metadata header, which has the form
51    ///
52    /// ```text
53    ///              7     6  5   4  3             0
54    ///             +-------+---+---+---------------+
55    /// header      |       |   |   |    version    |
56    ///             +-------+---+---+---------------+
57    ///                 ^         ^
58    ///                 |         +-- sorted_strings
59    ///                 +-- offset_size_minus_one
60    /// ```
61    ///
62    /// The version is a 4-bit value that must always contain the value 1.
63    /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
64    /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
65    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
66    pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
67        let version = header_byte & 0x0F; // First four bits
68        if version != CORRECT_VERSION_VALUE {
69            let err_msg = format!(
70                "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {version:b}",
71            );
72            return Err(ArrowError::InvalidArgumentError(err_msg));
73        }
74        let is_sorted = (header_byte & 0x10) != 0; // Fifth bit
75        let offset_size_minus_one = header_byte >> 6; // Last two bits
76        Ok(Self {
77            version,
78            is_sorted,
79            offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
80        })
81    }
82}
83
84/// [`Variant`] Metadata
85///
86/// See the [Variant Spec] file for more information
87///
88/// # Validation
89///
90/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
91/// underlying bytes are a valid encoding of variant metadata (see below).
92///
93/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
94/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
95/// validation cost is linear in the number of underlying bytes.
96///
97/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
98/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
99/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
100/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
101/// _unvalidated_ instance, if desired.
102///
103/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
104/// knows the underlying bytes were already validated previously, or if the caller intends to
105/// perform a small number of (fallible) accesses to a large dictionary.
106///
107/// A _validated_ variant [metadata instance guarantees that:
108///
109/// - header byte is valid
110/// - dictionary size is in bounds
111/// - offset array content is in-bounds
112/// - first offset is zero
113/// - last offset is in-bounds
114/// - all other offsets are in-bounds (*)
115/// - all offsets are monotonically increasing (*)
116/// - all values are valid utf-8 (*)
117///
118/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
119/// in the list above); it panics any of the other checks fails.
120///
121/// # Safety
122///
123/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing
124/// it with infallible methods may cause panics but will never lead to undefined behavior.
125///
126/// [`Variant`]: crate::Variant
127/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
128#[derive(Debug, Clone, PartialEq)]
129pub struct VariantMetadata<'m> {
130    pub(crate) bytes: &'m [u8],
131    header: VariantMetadataHeader,
132    dictionary_size: u32,
133    first_value_byte: u32,
134    validated: bool,
135}
136
137// We don't want this to grow because it increases the size of VariantList and VariantObject, which
138// could increase the size of Variant. All those size increases could hurt performance.
139const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
140
141impl<'m> VariantMetadata<'m> {
142    /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
143    /// dictionary entries.
144    ///
145    /// [validation]: Self#Validation
146    pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
147        Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
148    }
149
150    /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
151    /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as
152    /// indexing and iteration could also panic if the underlying bytes are invalid.
153    ///
154    /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes
155    /// were already validated previously by other means, or if the caller expects a small number of
156    /// accesses to a large dictionary (preferring to use a small number of fallible accesses as
157    /// needed, instead of paying expensive full validation up front).
158    ///
159    /// [validate]: Self#Validation
160    pub fn new(bytes: &'m [u8]) -> Self {
161        Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
162    }
163
164    // The actual constructor, which performs only basic (constant-const) validation.
165    pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
166        let header_byte = first_byte_from_slice(bytes)?;
167        let header = VariantMetadataHeader::try_new(header_byte)?;
168
169        // First element after header is dictionary size; the offset array immediately follows.
170        let dictionary_size =
171            header
172                .offset_size
173                .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
174
175        // Calculate the starting offset of the dictionary string bytes.
176        //
177        // There are dict_size + 1 offsets, and the value bytes immediately follow
178        // = (dict_size + 1) * offset_size + header.first_offset_byte()
179        let first_value_byte = dictionary_size
180            .checked_add(1)
181            .and_then(|n| n.checked_mul(header.offset_size()))
182            .and_then(|n| n.checked_add(header.first_offset_byte()))
183            .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
184
185        let mut new_self = Self {
186            bytes,
187            header,
188            dictionary_size,
189            first_value_byte,
190            validated: false,
191        };
192
193        // Validate just the first and last offset, ignoring the other offsets and all value bytes.
194        let first_offset = new_self.get_offset(0)?;
195        if first_offset != 0 {
196            return Err(ArrowError::InvalidArgumentError(format!(
197                "First offset is not zero: {first_offset}"
198            )));
199        }
200
201        // Use the last offset to upper-bound the byte slice
202        let last_offset = new_self
203            .get_offset(dictionary_size as _)?
204            .checked_add(first_value_byte)
205            .ok_or_else(|| overflow_error("variant metadata size"))?;
206        new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
207        Ok(new_self)
208    }
209
210    /// The number of metadata dictionary entries
211    pub fn len(&self) -> usize {
212        self.dictionary_size as _
213    }
214
215    /// True if this metadata dictionary contains no entries
216    pub fn is_empty(&self) -> bool {
217        self.len() == 0
218    }
219
220    /// True if this instance is fully [validated] for panic-free infallible accesses.
221    ///
222    /// [validated]: Self#Validation
223    pub fn is_fully_validated(&self) -> bool {
224        self.validated
225    }
226
227    /// Performs a full [validation] of this metadata dictionary and returns the result.
228    ///
229    /// [validation]: Self#Validation
230    pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
231        if !self.validated {
232            let offset_bytes = slice_from_slice(
233                self.bytes,
234                self.header.first_offset_byte() as _..self.first_value_byte as _,
235            )?;
236
237            // Verify the string values in the dictionary are UTF-8 encoded strings.
238            let value_buffer =
239                string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
240
241            let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
242
243            if self.header.is_sorted {
244                // Validate the dictionary values are unique and lexicographically sorted
245                //
246                // Since we use the offsets to access dictionary values, this also validates
247                // offsets are in-bounds and monotonically increasing
248                let mut current_offset = offsets.next().unwrap_or(0);
249                let mut prev_value: Option<&str> = None;
250                for next_offset in offsets {
251                    let current_value =
252                        value_buffer
253                            .get(current_offset..next_offset)
254                            .ok_or_else(|| {
255                                ArrowError::InvalidArgumentError(format!(
256                                "range {current_offset}..{next_offset} is invalid or out of bounds"
257                            ))
258                            })?;
259
260                    if let Some(prev_val) = prev_value {
261                        if current_value <= prev_val {
262                            return Err(ArrowError::InvalidArgumentError(
263                                "dictionary values are not unique and ordered".to_string(),
264                            ));
265                        }
266                    }
267
268                    prev_value = Some(current_value);
269                    current_offset = next_offset;
270                }
271            } else {
272                // Validate offsets are in-bounds and monotonically increasing
273                //
274                // Since shallow validation ensures the first and last offsets are in bounds,
275                // we can also verify all offsets are in-bounds by checking if
276                // offsets are monotonically increasing
277                if !offsets.is_sorted_by(|a, b| a < b) {
278                    return Err(ArrowError::InvalidArgumentError(
279                        "offsets not monotonically increasing".to_string(),
280                    ));
281                }
282            }
283
284            self.validated = true;
285        }
286        Ok(self)
287    }
288
289    /// Whether the dictionary keys are sorted and unique
290    pub fn is_sorted(&self) -> bool {
291        self.header.is_sorted
292    }
293
294    /// The variant protocol version
295    pub const fn version(&self) -> u8 {
296        self.header.version
297    }
298
299    /// Gets an offset array entry by index.
300    ///
301    /// This offset is an index into the dictionary, at the boundary between string `i-1` and string
302    /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
303    fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
304        let offset_byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
305        let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
306        self.header.offset_size.unpack_u32(bytes, i)
307    }
308
309    /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
310    /// underlying bytes are [invalid].
311    ///
312    /// [invalid]: Self#Validation
313    pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
314        let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as _;
315        string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
316    }
317
318    /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the
319    /// iterator encounters [invalid] data.
320    ///
321    /// [invalid]: Self#Validation
322    pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
323        (0..self.len()).map(|i| self.get(i))
324    }
325
326    /// Iterates over all dictionary entries. When working with [unvalidated] input, consider
327    /// [`Self::iter_try`] to avoid panics due to invalid data.
328    ///
329    /// [unvalidated]: Self#Validation
330    pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
331        self.iter_try()
332            .map(|result| result.expect("Invalid metadata dictionary entry"))
333    }
334}
335
336/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
337/// [unvalidated] input could also panic if the underlying bytes are invalid.
338///
339/// [unvalidated]: Self#Validation
340impl std::ops::Index<usize> for VariantMetadata<'_> {
341    type Output = str;
342
343    fn index(&self, i: usize) -> &str {
344        self.get(i).expect("Invalid metadata dictionary entry")
345    }
346}
347
348#[cfg(test)]
349mod tests {
350
351    use crate::VariantBuilder;
352
353    use super::*;
354
355    /// `"cat"`, `"dog"` – valid metadata
356    #[test]
357    fn try_new_ok_inline() {
358        let bytes = &[
359            0b0000_0001, // header, offset_size_minus_one=0 and version=1
360            0x02,        // dictionary_size (2 strings)
361            0x00,
362            0x03,
363            0x06,
364            b'c',
365            b'a',
366            b't',
367            b'd',
368            b'o',
369            b'g',
370        ];
371
372        let md = VariantMetadata::try_new(bytes).expect("should parse");
373        assert_eq!(md.len(), 2);
374        // Fields
375        assert_eq!(&md[0], "cat");
376        assert_eq!(&md[1], "dog");
377
378        // Offsets
379        assert_eq!(md.get_offset(0).unwrap(), 0x00);
380        assert_eq!(md.get_offset(1).unwrap(), 0x03);
381        assert_eq!(md.get_offset(2).unwrap(), 0x06);
382
383        let err = md.get_offset(3).unwrap_err();
384        assert!(
385            matches!(err, ArrowError::InvalidArgumentError(_)),
386            "unexpected error: {err:?}"
387        );
388
389        let fields: Vec<(usize, &str)> = md.iter().enumerate().collect();
390        assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]);
391    }
392
393    /// Too short buffer test (missing one required offset).
394    /// Should error with "metadata shorter than dictionary_size implies".
395    #[test]
396    fn try_new_missing_last_value() {
397        let bytes = &[
398            0b0000_0001, // header, offset_size_minus_one=0 and version=1
399            0x02,        // dictionary_size = 2
400            0x00,
401            0x01,
402            0x02,
403            b'a',
404            b'b', // <-- we'll remove this
405        ];
406
407        let working_md = VariantMetadata::try_new(bytes).expect("should parse");
408        assert_eq!(working_md.len(), 2);
409        assert_eq!(&working_md[0], "a");
410        assert_eq!(&working_md[1], "b");
411
412        let truncated = &bytes[..bytes.len() - 1];
413
414        let err = VariantMetadata::try_new(truncated).unwrap_err();
415        assert!(
416            matches!(err, ArrowError::InvalidArgumentError(_)),
417            "unexpected error: {err:?}"
418        );
419    }
420
421    #[test]
422    fn try_new_fails_non_monotonic() {
423        // 'cat', 'dog', 'lamb'
424        let bytes = &[
425            0b0000_0001, // header, offset_size_minus_one=0 and version=1
426            0x03,        // dictionary_size
427            0x00,
428            0x02,
429            0x01, // Doesn't increase monotonically
430            0x10,
431            b'c',
432            b'a',
433            b't',
434            b'd',
435            b'o',
436            b'g',
437            b'l',
438            b'a',
439            b'm',
440            b'b',
441        ];
442
443        let err = VariantMetadata::try_new(bytes).unwrap_err();
444        assert!(
445            matches!(err, ArrowError::InvalidArgumentError(_)),
446            "unexpected error: {err:?}"
447        );
448    }
449
450    #[test]
451    fn try_new_fails_non_monotonic2() {
452        // this test case checks whether offsets are monotonic in the full validation logic.
453
454        // 'cat', 'dog', 'lamb', "eel"
455        let bytes = &[
456            0b0000_0001, // header, offset_size_minus_one=0 and version=1
457            4,           // dictionary_size
458            0x00,
459            0x02,
460            0x01, // Doesn't increase monotonically
461            0x10,
462            13,
463            b'c',
464            b'a',
465            b't',
466            b'd',
467            b'o',
468            b'g',
469            b'l',
470            b'a',
471            b'm',
472            b'b',
473            b'e',
474            b'e',
475            b'l',
476        ];
477
478        let err = VariantMetadata::try_new(bytes).unwrap_err();
479
480        assert!(
481            matches!(err, ArrowError::InvalidArgumentError(_)),
482            "unexpected error: {err:?}"
483        );
484    }
485
486    #[test]
487    fn try_new_truncated_offsets_inline() {
488        // Missing final offset
489        let bytes = &[0b0000_0001, 0x02, 0x00, 0x01];
490
491        let err = VariantMetadata::try_new(bytes).unwrap_err();
492        assert!(
493            matches!(err, ArrowError::InvalidArgumentError(_)),
494            "unexpected error: {err:?}"
495        );
496    }
497
498    #[test]
499    fn empty_string_is_valid() {
500        let bytes = &[
501            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
502            1,
503            0x00,
504            0x00,
505        ];
506        let metadata = VariantMetadata::try_new(bytes).unwrap();
507        assert_eq!(&metadata[0], "");
508
509        let bytes = &[
510            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
511            2,
512            0x00,
513            0x00,
514            0x02,
515            b'h',
516            b'i',
517        ];
518        let metadata = VariantMetadata::try_new(bytes).unwrap();
519        assert_eq!(&metadata[0], "");
520        assert_eq!(&metadata[1], "hi");
521
522        let bytes = &[
523            0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
524            2,
525            0x00,
526            0x02,
527            0x02, // empty string is allowed, but must be first in a sorted dict
528            b'h',
529            b'i',
530        ];
531        let err = VariantMetadata::try_new(bytes).unwrap_err();
532        assert!(
533            matches!(err, ArrowError::InvalidArgumentError(_)),
534            "unexpected error: {err:?}"
535        );
536    }
537
538    #[test]
539    fn test_compare_sorted_dictionary_with_unsorted_dictionary() {
540        // create a sorted object
541        let mut b = VariantBuilder::new();
542        let mut o = b.new_object();
543
544        o.insert("a", false);
545        o.insert("b", false);
546
547        o.finish().unwrap();
548
549        let (m, _) = b.finish();
550
551        let m1 = VariantMetadata::new(&m);
552        assert!(m1.is_sorted());
553
554        // Create metadata with an unsorted dictionary (field names are "a", "a", "b")
555        // Since field names are not unique, it is considered not sorted.
556        let metadata_bytes = vec![
557            0b0000_0001,
558            3, // dictionary size
559            0, // "a"
560            1, // "a"
561            2, // "b"
562            3,
563            b'a',
564            b'a',
565            b'b',
566        ];
567        let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap();
568        assert!(!m2.is_sorted());
569
570        assert_ne!(m1, m2);
571    }
572
573    #[test]
574    fn test_compare_sorted_dictionary_with_sorted_dictionary() {
575        // create a sorted object
576        let mut b = VariantBuilder::new();
577        let mut o = b.new_object();
578
579        o.insert("a", false);
580        o.insert("b", false);
581
582        o.finish().unwrap();
583
584        let (m, _) = b.finish();
585
586        let m1 = VariantMetadata::new(&m);
587        let m2 = VariantMetadata::new(&m);
588
589        assert_eq!(m1, m2);
590    }
591}