parquet_variant/variant/metadata.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{OffsetSizeBytes, map_bytes_to_offsets};
19use crate::utils::{
20 first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
21 try_binary_search_range_by,
22};
23
24use arrow_schema::ArrowError;
25
26/// Header structure for [`VariantMetadata`]
27#[derive(Debug, Clone, Copy, PartialEq)]
28pub(crate) struct VariantMetadataHeader {
29 version: u8,
30 is_sorted: bool,
31 /// Note: This is `offset_size_minus_one` + 1
32 offset_size: OffsetSizeBytes,
33}
34
35// According to the spec this is currently always = 1, and so we store this const for validation
36// purposes and to make that visible.
37const CORRECT_VERSION_VALUE: u8 = 1;
38
39// The metadata header occupies one byte; use a named constant for readability
40const NUM_HEADER_BYTES: u32 = 1;
41
42impl VariantMetadataHeader {
43 // Hide the cast
44 const fn offset_size(&self) -> u32 {
45 self.offset_size as u32
46 }
47
48 // Avoid materializing this offset, since it's cheaply and safely computable
49 const fn first_offset_byte(&self) -> u32 {
50 NUM_HEADER_BYTES + self.offset_size()
51 }
52
53 /// Tries to construct the variant metadata header, which has the form
54 ///
55 /// ```text
56 /// 7 6 5 4 3 0
57 /// +-------+---+---+---------------+
58 /// header | | | | version |
59 /// +-------+---+---+---------------+
60 /// ^ ^
61 /// | +-- sorted_strings
62 /// +-- offset_size_minus_one
63 /// ```
64 ///
65 /// The version is a 4-bit value that must always contain the value 1.
66 /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
67 /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
68 /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
69 pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
70 let version = header_byte & 0x0F; // First four bits
71 if version != CORRECT_VERSION_VALUE {
72 let err_msg = format!(
73 "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {version:b}",
74 );
75 return Err(ArrowError::InvalidArgumentError(err_msg));
76 }
77 let is_sorted = (header_byte & 0x10) != 0; // Fifth bit
78 let offset_size_minus_one = header_byte >> 6; // Last two bits
79 Ok(Self {
80 version,
81 is_sorted,
82 offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
83 })
84 }
85}
86
87/// [`Variant`] Metadata
88///
89/// See the [Variant Spec] file for more information
90///
91/// # Validation
92///
93/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
94/// underlying bytes are a valid encoding of variant metadata (see below).
95///
96/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
97/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
98/// validation cost is linear in the number of underlying bytes.
99///
100/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
101/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
102/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
103/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
104/// _unvalidated_ instance, if desired.
105///
106/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
107/// knows the underlying bytes were already validated previously, or if the caller intends to
108/// perform a small number of (fallible) accesses to a large dictionary.
109///
110/// A _validated_ variant [metadata instance guarantees that:
111///
112/// - header byte is valid
113/// - dictionary size is in bounds
114/// - offset array content is in-bounds
115/// - first offset is zero
116/// - last offset is in-bounds
117/// - all other offsets are in-bounds (*)
118/// - all offsets are monotonically increasing (*)
119/// - all values are valid utf-8 (*)
120///
121/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
122/// in the list above); it panics any of the other checks fails.
123///
124/// # Safety
125///
126/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing
127/// it with infallible methods may cause panics but will never lead to undefined behavior.
128///
129/// [`Variant`]: crate::Variant
130/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
131#[derive(Debug, Clone, PartialEq)]
132pub struct VariantMetadata<'m> {
133 /// (Only) the bytes that make up this metadata instance.
134 pub(crate) bytes: &'m [u8],
135 header: VariantMetadataHeader,
136 dictionary_size: u32,
137 first_value_byte: u32,
138 validated: bool,
139}
140
141// We don't want this to grow because it increases the size of VariantList and VariantObject, which
142// could increase the size of Variant. All those size increases could hurt performance.
143const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
144
145/// The canonical byte slice corresponding to an empty metadata dictionary.
146///
147/// ```
148/// # use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, VariantMetadata, WritableMetadataBuilder};
149/// let mut metadata_builder = WritableMetadataBuilder::default();
150/// metadata_builder.finish();
151/// let metadata_bytes = metadata_builder.into_inner();
152/// assert_eq!(&metadata_bytes, EMPTY_VARIANT_METADATA_BYTES);
153/// ```
154pub const EMPTY_VARIANT_METADATA_BYTES: &[u8] = &[1, 0, 0];
155
156/// The empty metadata dictionary.
157///
158/// ```
159/// # use parquet_variant::{EMPTY_VARIANT_METADATA, VariantMetadata, WritableMetadataBuilder};
160/// let mut metadata_builder = WritableMetadataBuilder::default();
161/// metadata_builder.finish();
162/// let metadata_bytes = metadata_builder.into_inner();
163/// let empty_metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
164/// assert_eq!(empty_metadata, EMPTY_VARIANT_METADATA);
165/// ```
166pub const EMPTY_VARIANT_METADATA: VariantMetadata = VariantMetadata {
167 bytes: EMPTY_VARIANT_METADATA_BYTES,
168 header: VariantMetadataHeader {
169 version: CORRECT_VERSION_VALUE,
170 is_sorted: false,
171 offset_size: OffsetSizeBytes::One,
172 },
173 dictionary_size: 0,
174 first_value_byte: 3,
175 validated: true,
176};
177
178impl<'m> VariantMetadata<'m> {
179 /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
180 /// dictionary entries.
181 ///
182 /// [validation]: Self#Validation
183 pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
184 Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
185 }
186
187 /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
188 /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as
189 /// indexing and iteration could also panic if the underlying bytes are invalid.
190 ///
191 /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes
192 /// were already validated previously by other means, or if the caller expects a small number of
193 /// accesses to a large dictionary (preferring to use a small number of fallible accesses as
194 /// needed, instead of paying expensive full validation up front).
195 ///
196 /// [validate]: Self#Validation
197 pub fn new(bytes: &'m [u8]) -> Self {
198 Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
199 }
200
201 // The actual constructor, which performs only basic (constant-const) validation.
202 pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
203 let header_byte = first_byte_from_slice(bytes)?;
204 let header = VariantMetadataHeader::try_new(header_byte)?;
205
206 // First element after header is dictionary size; the offset array immediately follows.
207 let dictionary_size =
208 header
209 .offset_size
210 .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
211
212 // Calculate the starting offset of the dictionary string bytes.
213 //
214 // There are dict_size + 1 offsets, and the value bytes immediately follow
215 // = (dict_size + 1) * offset_size + header.first_offset_byte()
216 let first_value_byte = dictionary_size
217 .checked_add(1)
218 .and_then(|n| n.checked_mul(header.offset_size()))
219 .and_then(|n| n.checked_add(header.first_offset_byte()))
220 .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
221
222 let mut new_self = Self {
223 bytes,
224 header,
225 dictionary_size,
226 first_value_byte,
227 validated: false,
228 };
229
230 // Validate just the first and last offset, ignoring the other offsets and all value bytes.
231 let first_offset = new_self.get_offset(0)?;
232 if first_offset != 0 {
233 return Err(ArrowError::InvalidArgumentError(format!(
234 "First offset is not zero: {first_offset}"
235 )));
236 }
237
238 // Use the last offset to upper-bound the byte slice
239 let last_offset = new_self
240 .get_offset(dictionary_size as _)?
241 .checked_add(first_value_byte)
242 .ok_or_else(|| overflow_error("variant metadata size"))?;
243 new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
244 Ok(new_self)
245 }
246
247 /// The number of metadata dictionary entries
248 pub fn len(&self) -> usize {
249 self.dictionary_size as _
250 }
251
252 /// True if this metadata dictionary contains no entries
253 pub fn is_empty(&self) -> bool {
254 self.len() == 0
255 }
256
257 /// True if this instance is fully [validated] for panic-free infallible accesses.
258 ///
259 /// [validated]: Self#Validation
260 pub fn is_fully_validated(&self) -> bool {
261 self.validated
262 }
263
264 /// Performs a full [validation] of this metadata dictionary and returns the result.
265 ///
266 /// [validation]: Self#Validation
267 pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
268 if !self.validated {
269 let offset_bytes = slice_from_slice(
270 self.bytes,
271 self.header.first_offset_byte() as _..self.first_value_byte as _,
272 )?;
273
274 // Verify the string values in the dictionary are UTF-8 encoded strings.
275 let value_buffer =
276 string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
277
278 let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
279
280 if self.header.is_sorted {
281 // Validate the dictionary values are unique and lexicographically sorted
282 //
283 // Since we use the offsets to access dictionary values, this also validates
284 // offsets are in-bounds and monotonically increasing
285 let mut current_offset = offsets.next().unwrap_or(0);
286 let mut prev_value: Option<&str> = None;
287 for next_offset in offsets {
288 let current_value = value_buffer.get(current_offset..next_offset).ok_or_else(
289 || {
290 ArrowError::InvalidArgumentError(format!(
291 "range {current_offset}..{next_offset} is invalid or out of bounds"
292 ))
293 },
294 )?;
295
296 if let Some(prev_val) = prev_value {
297 if current_value <= prev_val {
298 return Err(ArrowError::InvalidArgumentError(
299 "dictionary values are not unique and ordered".to_string(),
300 ));
301 }
302 }
303
304 prev_value = Some(current_value);
305 current_offset = next_offset;
306 }
307 } else {
308 // Validate offsets are in-bounds and monotonically increasing
309 //
310 // Since shallow validation ensures the first and last offsets are in bounds,
311 // we can also verify all offsets are in-bounds by checking if
312 // offsets are monotonically increasing
313 if !offsets.is_sorted_by(|a, b| a < b) {
314 return Err(ArrowError::InvalidArgumentError(
315 "offsets not monotonically increasing".to_string(),
316 ));
317 }
318 }
319
320 self.validated = true;
321 }
322 Ok(self)
323 }
324
325 /// Whether the dictionary keys are sorted and unique
326 pub fn is_sorted(&self) -> bool {
327 self.header.is_sorted
328 }
329
330 /// The variant protocol version
331 pub const fn version(&self) -> u8 {
332 self.header.version
333 }
334
335 /// Gets an offset into the dictionary entry by index.
336 ///
337 /// This offset is an index into the dictionary, at the boundary between string `i-1` and string
338 /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
339 fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
340 let offset_byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
341 let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
342 self.header.offset_size.unpack_u32(bytes, i)
343 }
344
345 /// Returns the total size, in bytes, of the metadata.
346 ///
347 /// Note this value may be smaller than what was passed to [`Self::new`] or
348 /// [`Self::try_new`] if the input was larger than necessary to encode the
349 /// metadata dictionary.
350 pub fn size(&self) -> usize {
351 self.bytes.len()
352 }
353
354 /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
355 /// underlying bytes are [invalid].
356 ///
357 /// [invalid]: Self#Validation
358 pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
359 let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as _;
360 string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
361 }
362
363 // Helper method used by our `impl Index` and also by `get_entry`. Panics if the underlying
364 // bytes are invalid. Needed because the `Index` trait forces the returned result to have the
365 // lifetime of `self` instead of the string's own (longer) lifetime `'m`.
366 fn get_impl(&self, i: usize) -> &'m str {
367 self.get(i).expect("Invalid metadata dictionary entry")
368 }
369
370 /// Attempts to retrieve a dictionary entry and its field id, returning None if the requested field
371 /// name is not present. The search cost is logarithmic if [`Self::is_sorted`] and linear
372 /// otherwise.
373 ///
374 /// WARNING: This method panics if the underlying bytes are [invalid].
375 ///
376 /// [invalid]: Self#Validation
377 pub fn get_entry(&self, field_name: &str) -> Option<(u32, &'m str)> {
378 let field_id = if self.is_sorted() && self.len() > 10 {
379 // Binary search is faster for a not-tiny sorted metadata dictionary
380 let cmp = |i| Some(self.get_impl(i).cmp(field_name));
381 try_binary_search_range_by(0..self.len(), cmp)?.ok()?
382 } else {
383 // Fall back to Linear search for tiny or unsorted dictionary
384 (0..self.len()).find(|i| self.get_impl(*i) == field_name)?
385 };
386 Some((field_id as u32, self.get_impl(field_id)))
387 }
388
389 /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the
390 /// iterator encounters [invalid] data.
391 ///
392 /// [invalid]: Self#Validation
393 pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
394 (0..self.len()).map(|i| self.get(i))
395 }
396
397 /// Iterates over all dictionary entries. When working with [unvalidated] input, consider
398 /// [`Self::iter_try`] to avoid panics due to invalid data.
399 ///
400 /// [unvalidated]: Self#Validation
401 pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
402 self.iter_try()
403 .map(|result| result.expect("Invalid metadata dictionary entry"))
404 }
405}
406
407/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
408/// [unvalidated] input could also panic if the underlying bytes are invalid.
409///
410/// [unvalidated]: Self#Validation
411impl std::ops::Index<usize> for VariantMetadata<'_> {
412 type Output = str;
413
414 fn index(&self, i: usize) -> &str {
415 self.get_impl(i)
416 }
417}
418
419#[cfg(test)]
420mod tests {
421
422 use crate::VariantBuilder;
423
424 use super::*;
425
426 /// `"cat"`, `"dog"` – valid metadata
427 #[test]
428 fn try_new_ok_inline() {
429 let bytes = &[
430 0b0000_0001, // header, offset_size_minus_one=0 and version=1
431 0x02, // dictionary_size (2 strings)
432 0x00,
433 0x03,
434 0x06,
435 b'c',
436 b'a',
437 b't',
438 b'd',
439 b'o',
440 b'g',
441 ];
442
443 let md = VariantMetadata::try_new(bytes).expect("should parse");
444 assert_eq!(md.len(), 2);
445 // Fields
446 assert_eq!(&md[0], "cat");
447 assert_eq!(&md[1], "dog");
448
449 // Offsets
450 assert_eq!(md.get_offset(0).unwrap(), 0x00);
451 assert_eq!(md.get_offset(1).unwrap(), 0x03);
452 assert_eq!(md.get_offset(2).unwrap(), 0x06);
453
454 let err = md.get_offset(3).unwrap_err();
455 assert!(
456 matches!(err, ArrowError::InvalidArgumentError(_)),
457 "unexpected error: {err:?}"
458 );
459
460 let fields: Vec<(usize, &str)> = md.iter().enumerate().collect();
461 assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]);
462 }
463
464 /// Too short buffer test (missing one required offset).
465 /// Should error with "metadata shorter than dictionary_size implies".
466 #[test]
467 fn try_new_missing_last_value() {
468 let bytes = &[
469 0b0000_0001, // header, offset_size_minus_one=0 and version=1
470 0x02, // dictionary_size = 2
471 0x00,
472 0x01,
473 0x02,
474 b'a',
475 b'b', // <-- we'll remove this
476 ];
477
478 let working_md = VariantMetadata::try_new(bytes).expect("should parse");
479 assert_eq!(working_md.len(), 2);
480 assert_eq!(&working_md[0], "a");
481 assert_eq!(&working_md[1], "b");
482
483 let truncated = &bytes[..bytes.len() - 1];
484
485 let err = VariantMetadata::try_new(truncated).unwrap_err();
486 assert!(
487 matches!(err, ArrowError::InvalidArgumentError(_)),
488 "unexpected error: {err:?}"
489 );
490 }
491
492 #[test]
493 fn try_new_fails_non_monotonic() {
494 // 'cat', 'dog', 'lamb'
495 let bytes = &[
496 0b0000_0001, // header, offset_size_minus_one=0 and version=1
497 0x03, // dictionary_size
498 0x00,
499 0x02,
500 0x01, // Doesn't increase monotonically
501 0x10,
502 b'c',
503 b'a',
504 b't',
505 b'd',
506 b'o',
507 b'g',
508 b'l',
509 b'a',
510 b'm',
511 b'b',
512 ];
513
514 let err = VariantMetadata::try_new(bytes).unwrap_err();
515 assert!(
516 matches!(err, ArrowError::InvalidArgumentError(_)),
517 "unexpected error: {err:?}"
518 );
519 }
520
521 #[test]
522 fn try_new_fails_non_monotonic2() {
523 // this test case checks whether offsets are monotonic in the full validation logic.
524
525 // 'cat', 'dog', 'lamb', "eel"
526 let bytes = &[
527 0b0000_0001, // header, offset_size_minus_one=0 and version=1
528 4, // dictionary_size
529 0x00,
530 0x02,
531 0x01, // Doesn't increase monotonically
532 0x10,
533 13,
534 b'c',
535 b'a',
536 b't',
537 b'd',
538 b'o',
539 b'g',
540 b'l',
541 b'a',
542 b'm',
543 b'b',
544 b'e',
545 b'e',
546 b'l',
547 ];
548
549 let err = VariantMetadata::try_new(bytes).unwrap_err();
550
551 assert!(
552 matches!(err, ArrowError::InvalidArgumentError(_)),
553 "unexpected error: {err:?}"
554 );
555 }
556
557 #[test]
558 fn try_new_truncated_offsets_inline() {
559 // Missing final offset
560 let bytes = &[0b0000_0001, 0x02, 0x00, 0x01];
561
562 let err = VariantMetadata::try_new(bytes).unwrap_err();
563 assert!(
564 matches!(err, ArrowError::InvalidArgumentError(_)),
565 "unexpected error: {err:?}"
566 );
567 }
568
569 #[test]
570 fn empty_string_is_valid() {
571 let bytes = &[
572 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
573 1,
574 0x00,
575 0x00,
576 ];
577 let metadata = VariantMetadata::try_new(bytes).unwrap();
578 assert_eq!(&metadata[0], "");
579
580 let bytes = &[
581 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
582 2,
583 0x00,
584 0x00,
585 0x02,
586 b'h',
587 b'i',
588 ];
589 let metadata = VariantMetadata::try_new(bytes).unwrap();
590 assert_eq!(&metadata[0], "");
591 assert_eq!(&metadata[1], "hi");
592
593 let bytes = &[
594 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
595 2,
596 0x00,
597 0x02,
598 0x02, // empty string is allowed, but must be first in a sorted dict
599 b'h',
600 b'i',
601 ];
602 let err = VariantMetadata::try_new(bytes).unwrap_err();
603 assert!(
604 matches!(err, ArrowError::InvalidArgumentError(_)),
605 "unexpected error: {err:?}"
606 );
607 }
608
609 #[test]
610 fn test_compare_sorted_dictionary_with_unsorted_dictionary() {
611 // create a sorted object
612 let mut b = VariantBuilder::new();
613 let mut o = b.new_object();
614
615 o.insert("a", false);
616 o.insert("b", false);
617
618 o.finish();
619
620 let (m, _) = b.finish();
621
622 let m1 = VariantMetadata::new(&m);
623 assert!(m1.is_sorted());
624
625 // Create metadata with an unsorted dictionary (field names are "a", "a", "b")
626 // Since field names are not unique, it is considered not sorted.
627 let metadata_bytes = vec![
628 0b0000_0001,
629 3, // dictionary size
630 0, // "a"
631 1, // "a"
632 2, // "b"
633 3,
634 b'a',
635 b'a',
636 b'b',
637 ];
638 let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap();
639 assert!(!m2.is_sorted());
640
641 assert_ne!(m1, m2);
642 }
643
644 #[test]
645 fn test_compare_sorted_dictionary_with_sorted_dictionary() {
646 // create a sorted object
647 let mut b = VariantBuilder::new();
648 let mut o = b.new_object();
649
650 o.insert("a", false);
651 o.insert("b", false);
652
653 o.finish();
654
655 let (m, _) = b.finish();
656
657 let m1 = VariantMetadata::new(&m);
658 let m2 = VariantMetadata::new(&m);
659
660 assert_eq!(m1, m2);
661 }
662}