parquet_variant/variant/
metadata.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
19use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice};
20
21use arrow_schema::ArrowError;
22
23/// Header structure for [`VariantMetadata`]
24#[derive(Debug, Clone, Copy, PartialEq)]
25pub(crate) struct VariantMetadataHeader {
26    version: u8,
27    is_sorted: bool,
28    /// Note: This is `offset_size_minus_one` + 1
29    offset_size: OffsetSizeBytes,
30}
31
32// According to the spec this is currently always = 1, and so we store this const for validation
33// purposes and to make that visible.
34const CORRECT_VERSION_VALUE: u8 = 1;
35
36// The metadata header occupies one byte; use a named constant for readability
37const NUM_HEADER_BYTES: u32 = 1;
38
39impl VariantMetadataHeader {
40    // Hide the cast
41    const fn offset_size(&self) -> u32 {
42        self.offset_size as u32
43    }
44
45    // Avoid materializing this offset, since it's cheaply and safely computable
46    const fn first_offset_byte(&self) -> u32 {
47        NUM_HEADER_BYTES + self.offset_size()
48    }
49
50    /// Tries to construct the variant metadata header, which has the form
51    ///
52    /// ```text
53    ///              7     6  5   4  3             0
54    ///             +-------+---+---+---------------+
55    /// header      |       |   |   |    version    |
56    ///             +-------+---+---+---------------+
57    ///                 ^         ^
58    ///                 |         +-- sorted_strings
59    ///                 +-- offset_size_minus_one
60    /// ```
61    ///
62    /// The version is a 4-bit value that must always contain the value 1.
63    /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
64    /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
65    /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
66    pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
67        let version = header_byte & 0x0F; // First four bits
68        if version != CORRECT_VERSION_VALUE {
69            let err_msg = format!(
70                "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {version:b}",
71            );
72            return Err(ArrowError::InvalidArgumentError(err_msg));
73        }
74        let is_sorted = (header_byte & 0x10) != 0; // Fifth bit
75        let offset_size_minus_one = header_byte >> 6; // Last two bits
76        Ok(Self {
77            version,
78            is_sorted,
79            offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
80        })
81    }
82}
83
84/// [`Variant`] Metadata
85///
86/// See the [Variant Spec] file for more information
87///
88/// # Validation
89///
90/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
91/// underlying bytes are a valid encoding of variant metadata (see below).
92///
93/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
94/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
95/// validation cost is linear in the number of underlying bytes.
96///
97/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
98/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
99/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
100/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
101/// _unvalidated_ instance, if desired.
102///
103/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
104/// knows the underlying bytes were already validated previously, or if the caller intends to
105/// perform a small number of (fallible) accesses to a large dictionary.
106///
107/// A _validated_ variant [metadata instance guarantees that:
108///
109/// - header byte is valid
110/// - dictionary size is in bounds
111/// - offset array content is in-bounds
112/// - first offset is zero
113/// - last offset is in-bounds
114/// - all other offsets are in-bounds (*)
115/// - all offsets are monotonically increasing (*)
116/// - all values are valid utf-8 (*)
117///
118/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
119/// in the list above); it panics any of the other checks fails.
120///
121/// # Safety
122///
123/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing
124/// it with infallible methods may cause panics but will never lead to undefined behavior.
125///
126/// [`Variant`]: crate::Variant
127/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
128#[derive(Debug, Clone, PartialEq)]
129pub struct VariantMetadata<'m> {
130    pub(crate) bytes: &'m [u8],
131    header: VariantMetadataHeader,
132    dictionary_size: u32,
133    first_value_byte: u32,
134    validated: bool,
135}
136
137// We don't want this to grow because it increases the size of VariantList and VariantObject, which
138// could increase the size of Variant. All those size increases could hurt performance.
139const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
140
141impl<'m> VariantMetadata<'m> {
142    /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
143    /// dictionary entries.
144    ///
145    /// [validation]: Self#Validation
146    pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
147        Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
148    }
149
150    /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
151    /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as
152    /// indexing and iteration could also panic if the underlying bytes are invalid.
153    ///
154    /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes
155    /// were already validated previously by other means, or if the caller expects a small number of
156    /// accesses to a large dictionary (preferring to use a small number of fallible accesses as
157    /// needed, instead of paying expensive full validation up front).
158    ///
159    /// [validate]: Self#Validation
160    pub fn new(bytes: &'m [u8]) -> Self {
161        Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
162    }
163
164    // The actual constructor, which performs only basic (constant-const) validation.
165    pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
166        let header_byte = first_byte_from_slice(bytes)?;
167        let header = VariantMetadataHeader::try_new(header_byte)?;
168
169        // First element after header is dictionary size; the offset array immediately follows.
170        let dictionary_size =
171            header
172                .offset_size
173                .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
174
175        // Calculate the starting offset of the dictionary string bytes.
176        //
177        // There are dict_size + 1 offsets, and the value bytes immediately follow
178        // = (dict_size + 1) * offset_size + header.first_offset_byte()
179        let first_value_byte = dictionary_size
180            .checked_add(1)
181            .and_then(|n| n.checked_mul(header.offset_size()))
182            .and_then(|n| n.checked_add(header.first_offset_byte()))
183            .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
184
185        let mut new_self = Self {
186            bytes,
187            header,
188            dictionary_size,
189            first_value_byte,
190            validated: false,
191        };
192
193        // Validate just the first and last offset, ignoring the other offsets and all value bytes.
194        let first_offset = new_self.get_offset(0)?;
195        if first_offset != 0 {
196            return Err(ArrowError::InvalidArgumentError(format!(
197                "First offset is not zero: {first_offset}"
198            )));
199        }
200
201        // Use the last offset to upper-bound the byte slice
202        let last_offset = new_self
203            .get_offset(dictionary_size as _)?
204            .checked_add(first_value_byte)
205            .ok_or_else(|| overflow_error("variant metadata size"))?;
206        new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
207        Ok(new_self)
208    }
209
210    /// The number of metadata dictionary entries
211    pub fn len(&self) -> usize {
212        self.dictionary_size()
213    }
214
215    /// True if this metadata dictionary contains no entries
216    pub fn is_empty(&self) -> bool {
217        self.len() == 0
218    }
219
220    /// True if this instance is fully [validated] for panic-free infallible accesses.
221    ///
222    /// [validated]: Self#Validation
223    pub fn is_fully_validated(&self) -> bool {
224        self.validated
225    }
226
227    /// Performs a full [validation] of this metadata dictionary and returns the result.
228    ///
229    /// [validation]: Self#Validation
230    pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
231        if !self.validated {
232            let offset_bytes = slice_from_slice(
233                self.bytes,
234                self.header.first_offset_byte() as _..self.first_value_byte as _,
235            )?;
236
237            let offsets =
238                map_bytes_to_offsets(offset_bytes, self.header.offset_size).collect::<Vec<_>>();
239
240            // Verify the string values in the dictionary are UTF-8 encoded strings.
241            let value_buffer =
242                string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
243
244            if self.header.is_sorted {
245                // Validate the dictionary values are unique and lexicographically sorted
246                //
247                // Since we use the offsets to access dictionary values, this also validates
248                // offsets are in-bounds and monotonically increasing
249                let are_dictionary_values_unique_and_sorted = (1..offsets.len())
250                    .map(|i| {
251                        let field_range = offsets[i - 1]..offsets[i];
252                        value_buffer.get(field_range)
253                    })
254                    .is_sorted_by(|a, b| match (a, b) {
255                        (Some(a), Some(b)) => a < b,
256                        _ => false,
257                    });
258
259                if !are_dictionary_values_unique_and_sorted {
260                    return Err(ArrowError::InvalidArgumentError(
261                        "dictionary values are not unique and ordered".to_string(),
262                    ));
263                }
264            } else {
265                // Validate offsets are in-bounds and monotonically increasing
266                //
267                // Since shallow validation ensures the first and last offsets are in bounds,
268                // we can also verify all offsets are in-bounds by checking if
269                // offsets are monotonically increasing
270                let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
271                if !are_offsets_monotonic {
272                    return Err(ArrowError::InvalidArgumentError(
273                        "offsets not monotonically increasing".to_string(),
274                    ));
275                }
276            }
277
278            self.validated = true;
279        }
280        Ok(self)
281    }
282
283    /// Whether the dictionary keys are sorted and unique
284    pub fn is_sorted(&self) -> bool {
285        self.header.is_sorted
286    }
287
288    /// Get the dictionary size
289    pub const fn dictionary_size(&self) -> usize {
290        self.dictionary_size as _
291    }
292
293    /// The variant protocol version
294    pub const fn version(&self) -> u8 {
295        self.header.version
296    }
297
298    /// Gets an offset array entry by index.
299    ///
300    /// This offset is an index into the dictionary, at the boundary between string `i-1` and string
301    /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
302    fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
303        let offset_byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
304        let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
305        self.header.offset_size.unpack_u32(bytes, i)
306    }
307
308    /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
309    /// underlying bytes are [invalid].
310    ///
311    /// [invalid]: Self#Validation
312    pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
313        let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as _;
314        string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
315    }
316
317    /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the
318    /// iterator encounters [invalid] data.
319    ///
320    /// [invalid]: Self#Validation
321    pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
322        (0..self.len()).map(|i| self.get(i))
323    }
324
325    /// Iterates over all dictionary entries. When working with [unvalidated] input, consider
326    /// [`Self::iter_try`] to avoid panics due to invalid data.
327    ///
328    /// [unvalidated]: Self#Validation
329    pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
330        self.iter_try()
331            .map(|result| result.expect("Invalid metadata dictionary entry"))
332    }
333}
334
335/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
336/// [unvalidated] input could also panic if the underlying bytes are invalid.
337///
338/// [unvalidated]: Self#Validation
339impl std::ops::Index<usize> for VariantMetadata<'_> {
340    type Output = str;
341
342    fn index(&self, i: usize) -> &str {
343        self.get(i).expect("Invalid metadata dictionary entry")
344    }
345}
346
347#[cfg(test)]
348mod tests {
349    use super::*;
350
351    /// `"cat"`, `"dog"` – valid metadata
352    #[test]
353    fn try_new_ok_inline() {
354        let bytes = &[
355            0b0000_0001, // header, offset_size_minus_one=0 and version=1
356            0x02,        // dictionary_size (2 strings)
357            0x00,
358            0x03,
359            0x06,
360            b'c',
361            b'a',
362            b't',
363            b'd',
364            b'o',
365            b'g',
366        ];
367
368        let md = VariantMetadata::try_new(bytes).expect("should parse");
369        assert_eq!(md.dictionary_size(), 2);
370        // Fields
371        assert_eq!(&md[0], "cat");
372        assert_eq!(&md[1], "dog");
373
374        // Offsets
375        assert_eq!(md.get_offset(0).unwrap(), 0x00);
376        assert_eq!(md.get_offset(1).unwrap(), 0x03);
377        assert_eq!(md.get_offset(2).unwrap(), 0x06);
378
379        let err = md.get_offset(3).unwrap_err();
380        assert!(
381            matches!(err, ArrowError::InvalidArgumentError(_)),
382            "unexpected error: {err:?}"
383        );
384
385        let fields: Vec<(usize, &str)> = md.iter().enumerate().collect();
386        assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]);
387    }
388
389    /// Too short buffer test (missing one required offset).
390    /// Should error with "metadata shorter than dictionary_size implies".
391    #[test]
392    fn try_new_missing_last_value() {
393        let bytes = &[
394            0b0000_0001, // header, offset_size_minus_one=0 and version=1
395            0x02,        // dictionary_size = 2
396            0x00,
397            0x01,
398            0x02,
399            b'a',
400            b'b', // <-- we'll remove this
401        ];
402
403        let working_md = VariantMetadata::try_new(bytes).expect("should parse");
404        assert_eq!(working_md.dictionary_size(), 2);
405        assert_eq!(&working_md[0], "a");
406        assert_eq!(&working_md[1], "b");
407
408        let truncated = &bytes[..bytes.len() - 1];
409
410        let err = VariantMetadata::try_new(truncated).unwrap_err();
411        assert!(
412            matches!(err, ArrowError::InvalidArgumentError(_)),
413            "unexpected error: {err:?}"
414        );
415    }
416
417    #[test]
418    fn try_new_fails_non_monotonic() {
419        // 'cat', 'dog', 'lamb'
420        let bytes = &[
421            0b0000_0001, // header, offset_size_minus_one=0 and version=1
422            0x03,        // dictionary_size
423            0x00,
424            0x02,
425            0x01, // Doesn't increase monotonically
426            0x10,
427            b'c',
428            b'a',
429            b't',
430            b'd',
431            b'o',
432            b'g',
433            b'l',
434            b'a',
435            b'm',
436            b'b',
437        ];
438
439        let err = VariantMetadata::try_new(bytes).unwrap_err();
440        assert!(
441            matches!(err, ArrowError::InvalidArgumentError(_)),
442            "unexpected error: {err:?}"
443        );
444    }
445
446    #[test]
447    fn try_new_fails_non_monotonic2() {
448        // this test case checks whether offsets are monotonic in the full validation logic.
449
450        // 'cat', 'dog', 'lamb', "eel"
451        let bytes = &[
452            0b0000_0001, // header, offset_size_minus_one=0 and version=1
453            4,           // dictionary_size
454            0x00,
455            0x02,
456            0x01, // Doesn't increase monotonically
457            0x10,
458            13,
459            b'c',
460            b'a',
461            b't',
462            b'd',
463            b'o',
464            b'g',
465            b'l',
466            b'a',
467            b'm',
468            b'b',
469            b'e',
470            b'e',
471            b'l',
472        ];
473
474        let err = VariantMetadata::try_new(bytes).unwrap_err();
475
476        assert!(
477            matches!(err, ArrowError::InvalidArgumentError(_)),
478            "unexpected error: {err:?}"
479        );
480    }
481
482    #[test]
483    fn try_new_truncated_offsets_inline() {
484        // Missing final offset
485        let bytes = &[0b0000_0001, 0x02, 0x00, 0x01];
486
487        let err = VariantMetadata::try_new(bytes).unwrap_err();
488        assert!(
489            matches!(err, ArrowError::InvalidArgumentError(_)),
490            "unexpected error: {err:?}"
491        );
492    }
493}