parquet_variant/variant/metadata.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
19use crate::utils::{
20 first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
21 try_binary_search_range_by,
22};
23
24use arrow_schema::ArrowError;
25
26/// Header structure for [`VariantMetadata`]
27#[derive(Debug, Clone, Copy, PartialEq)]
28pub(crate) struct VariantMetadataHeader {
29 version: u8,
30 is_sorted: bool,
31 /// Note: This is `offset_size_minus_one` + 1
32 offset_size: OffsetSizeBytes,
33}
34
35// According to the spec this is currently always = 1, and so we store this const for validation
36// purposes and to make that visible.
37const CORRECT_VERSION_VALUE: u8 = 1;
38
39// The metadata header occupies one byte; use a named constant for readability
40const NUM_HEADER_BYTES: u32 = 1;
41
42impl VariantMetadataHeader {
43 // Hide the cast
44 const fn offset_size(&self) -> u32 {
45 self.offset_size as u32
46 }
47
48 // Avoid materializing this offset, since it's cheaply and safely computable
49 const fn first_offset_byte(&self) -> u32 {
50 NUM_HEADER_BYTES + self.offset_size()
51 }
52
53 /// Tries to construct the variant metadata header, which has the form
54 ///
55 /// ```text
56 /// 7 6 5 4 3 0
57 /// +-------+---+---+---------------+
58 /// header | | | | version |
59 /// +-------+---+---+---------------+
60 /// ^ ^
61 /// | +-- sorted_strings
62 /// +-- offset_size_minus_one
63 /// ```
64 ///
65 /// The version is a 4-bit value that must always contain the value 1.
66 /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
67 /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
68 /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
69 pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
70 let version = header_byte & 0x0F; // First four bits
71 if version != CORRECT_VERSION_VALUE {
72 let err_msg = format!(
73 "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {version:b}",
74 );
75 return Err(ArrowError::InvalidArgumentError(err_msg));
76 }
77 let is_sorted = (header_byte & 0x10) != 0; // Fifth bit
78 let offset_size_minus_one = header_byte >> 6; // Last two bits
79 Ok(Self {
80 version,
81 is_sorted,
82 offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
83 })
84 }
85}
86
87/// [`Variant`] Metadata
88///
89/// See the [Variant Spec] file for more information
90///
91/// # Validation
92///
93/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
94/// underlying bytes are a valid encoding of variant metadata (see below).
95///
96/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
97/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
98/// validation cost is linear in the number of underlying bytes.
99///
100/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
101/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
102/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
103/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
104/// _unvalidated_ instance, if desired.
105///
106/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
107/// knows the underlying bytes were already validated previously, or if the caller intends to
108/// perform a small number of (fallible) accesses to a large dictionary.
109///
110/// A _validated_ variant [metadata instance guarantees that:
111///
112/// - header byte is valid
113/// - dictionary size is in bounds
114/// - offset array content is in-bounds
115/// - first offset is zero
116/// - last offset is in-bounds
117/// - all other offsets are in-bounds (*)
118/// - all offsets are monotonically increasing (*)
119/// - all values are valid utf-8 (*)
120///
121/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
122/// in the list above); it panics any of the other checks fails.
123///
124/// # Safety
125///
126/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing
127/// it with infallible methods may cause panics but will never lead to undefined behavior.
128///
129/// [`Variant`]: crate::Variant
130/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
131#[derive(Debug, Clone, PartialEq)]
132pub struct VariantMetadata<'m> {
133 /// (Only) the bytes that make up this metadata instance.
134 pub(crate) bytes: &'m [u8],
135 header: VariantMetadataHeader,
136 dictionary_size: u32,
137 first_value_byte: u32,
138 validated: bool,
139}
140
141// We don't want this to grow because it increases the size of VariantList and VariantObject, which
142// could increase the size of Variant. All those size increases could hurt performance.
143const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
144
145/// The canonical byte slice corresponding to an empty metadata dictionary.
146///
147/// ```
148/// # use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, VariantMetadata, WritableMetadataBuilder};
149/// let mut metadata_builder = WritableMetadataBuilder::default();
150/// metadata_builder.finish();
151/// let metadata_bytes = metadata_builder.into_inner();
152/// assert_eq!(&metadata_bytes, EMPTY_VARIANT_METADATA_BYTES);
153/// ```
154pub const EMPTY_VARIANT_METADATA_BYTES: &[u8] = &[1, 0, 0];
155
156/// The empty metadata dictionary.
157///
158/// ```
159/// # use parquet_variant::{EMPTY_VARIANT_METADATA, VariantMetadata, WritableMetadataBuilder};
160/// let mut metadata_builder = WritableMetadataBuilder::default();
161/// metadata_builder.finish();
162/// let metadata_bytes = metadata_builder.into_inner();
163/// let empty_metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
164/// assert_eq!(empty_metadata, EMPTY_VARIANT_METADATA);
165/// ```
166pub const EMPTY_VARIANT_METADATA: VariantMetadata = VariantMetadata {
167 bytes: EMPTY_VARIANT_METADATA_BYTES,
168 header: VariantMetadataHeader {
169 version: CORRECT_VERSION_VALUE,
170 is_sorted: false,
171 offset_size: OffsetSizeBytes::One,
172 },
173 dictionary_size: 0,
174 first_value_byte: 3,
175 validated: true,
176};
177
178impl<'m> VariantMetadata<'m> {
179 /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
180 /// dictionary entries.
181 ///
182 /// [validation]: Self#Validation
183 pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
184 Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
185 }
186
187 /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
188 /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as
189 /// indexing and iteration could also panic if the underlying bytes are invalid.
190 ///
191 /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes
192 /// were already validated previously by other means, or if the caller expects a small number of
193 /// accesses to a large dictionary (preferring to use a small number of fallible accesses as
194 /// needed, instead of paying expensive full validation up front).
195 ///
196 /// [validate]: Self#Validation
197 pub fn new(bytes: &'m [u8]) -> Self {
198 Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
199 }
200
201 // The actual constructor, which performs only basic (constant-const) validation.
202 pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
203 let header_byte = first_byte_from_slice(bytes)?;
204 let header = VariantMetadataHeader::try_new(header_byte)?;
205
206 // First element after header is dictionary size; the offset array immediately follows.
207 let dictionary_size =
208 header
209 .offset_size
210 .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
211
212 // Calculate the starting offset of the dictionary string bytes.
213 //
214 // There are dict_size + 1 offsets, and the value bytes immediately follow
215 // = (dict_size + 1) * offset_size + header.first_offset_byte()
216 let first_value_byte = dictionary_size
217 .checked_add(1)
218 .and_then(|n| n.checked_mul(header.offset_size()))
219 .and_then(|n| n.checked_add(header.first_offset_byte()))
220 .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
221
222 let mut new_self = Self {
223 bytes,
224 header,
225 dictionary_size,
226 first_value_byte,
227 validated: false,
228 };
229
230 // Validate just the first and last offset, ignoring the other offsets and all value bytes.
231 let first_offset = new_self.get_offset(0)?;
232 if first_offset != 0 {
233 return Err(ArrowError::InvalidArgumentError(format!(
234 "First offset is not zero: {first_offset}"
235 )));
236 }
237
238 // Use the last offset to upper-bound the byte slice
239 let last_offset = new_self
240 .get_offset(dictionary_size as _)?
241 .checked_add(first_value_byte)
242 .ok_or_else(|| overflow_error("variant metadata size"))?;
243 new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
244 Ok(new_self)
245 }
246
247 /// The number of metadata dictionary entries
248 pub fn len(&self) -> usize {
249 self.dictionary_size as _
250 }
251
252 /// True if this metadata dictionary contains no entries
253 pub fn is_empty(&self) -> bool {
254 self.len() == 0
255 }
256
257 /// True if this instance is fully [validated] for panic-free infallible accesses.
258 ///
259 /// [validated]: Self#Validation
260 pub fn is_fully_validated(&self) -> bool {
261 self.validated
262 }
263
264 /// Performs a full [validation] of this metadata dictionary and returns the result.
265 ///
266 /// [validation]: Self#Validation
267 pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
268 if !self.validated {
269 let offset_bytes = slice_from_slice(
270 self.bytes,
271 self.header.first_offset_byte() as _..self.first_value_byte as _,
272 )?;
273
274 // Verify the string values in the dictionary are UTF-8 encoded strings.
275 let value_buffer =
276 string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
277
278 let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
279
280 if self.header.is_sorted {
281 // Validate the dictionary values are unique and lexicographically sorted
282 //
283 // Since we use the offsets to access dictionary values, this also validates
284 // offsets are in-bounds and monotonically increasing
285 let mut current_offset = offsets.next().unwrap_or(0);
286 let mut prev_value: Option<&str> = None;
287 for next_offset in offsets {
288 let current_value =
289 value_buffer
290 .get(current_offset..next_offset)
291 .ok_or_else(|| {
292 ArrowError::InvalidArgumentError(format!(
293 "range {current_offset}..{next_offset} is invalid or out of bounds"
294 ))
295 })?;
296
297 if let Some(prev_val) = prev_value {
298 if current_value <= prev_val {
299 return Err(ArrowError::InvalidArgumentError(
300 "dictionary values are not unique and ordered".to_string(),
301 ));
302 }
303 }
304
305 prev_value = Some(current_value);
306 current_offset = next_offset;
307 }
308 } else {
309 // Validate offsets are in-bounds and monotonically increasing
310 //
311 // Since shallow validation ensures the first and last offsets are in bounds,
312 // we can also verify all offsets are in-bounds by checking if
313 // offsets are monotonically increasing
314 if !offsets.is_sorted_by(|a, b| a < b) {
315 return Err(ArrowError::InvalidArgumentError(
316 "offsets not monotonically increasing".to_string(),
317 ));
318 }
319 }
320
321 self.validated = true;
322 }
323 Ok(self)
324 }
325
326 /// Whether the dictionary keys are sorted and unique
327 pub fn is_sorted(&self) -> bool {
328 self.header.is_sorted
329 }
330
331 /// The variant protocol version
332 pub const fn version(&self) -> u8 {
333 self.header.version
334 }
335
336 /// Gets an offset into the dictionary entry by index.
337 ///
338 /// This offset is an index into the dictionary, at the boundary between string `i-1` and string
339 /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
340 fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
341 let offset_byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
342 let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
343 self.header.offset_size.unpack_u32(bytes, i)
344 }
345
346 /// Returns the total size, in bytes, of the metadata.
347 ///
348 /// Note this value may be smaller than what was passed to [`Self::new`] or
349 /// [`Self::try_new`] if the input was larger than necessary to encode the
350 /// metadata dictionary.
351 pub fn size(&self) -> usize {
352 self.bytes.len()
353 }
354
355 /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
356 /// underlying bytes are [invalid].
357 ///
358 /// [invalid]: Self#Validation
359 pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
360 let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as _;
361 string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
362 }
363
364 // Helper method used by our `impl Index` and also by `get_entry`. Panics if the underlying
365 // bytes are invalid. Needed because the `Index` trait forces the returned result to have the
366 // lifetime of `self` instead of the string's own (longer) lifetime `'m`.
367 fn get_impl(&self, i: usize) -> &'m str {
368 self.get(i).expect("Invalid metadata dictionary entry")
369 }
370
371 /// Attempts to retrieve a dictionary entry and its field id, returning None if the requested field
372 /// name is not present. The search cost is logarithmic if [`Self::is_sorted`] and linear
373 /// otherwise.
374 ///
375 /// WARNING: This method panics if the underlying bytes are [invalid].
376 ///
377 /// [invalid]: Self#Validation
378 pub fn get_entry(&self, field_name: &str) -> Option<(u32, &'m str)> {
379 let field_id = if self.is_sorted() && self.len() > 10 {
380 // Binary search is faster for a not-tiny sorted metadata dictionary
381 let cmp = |i| Some(self.get_impl(i).cmp(field_name));
382 try_binary_search_range_by(0..self.len(), cmp)?.ok()?
383 } else {
384 // Fall back to Linear search for tiny or unsorted dictionary
385 (0..self.len()).find(|i| self.get_impl(*i) == field_name)?
386 };
387 Some((field_id as u32, self.get_impl(field_id)))
388 }
389
390 /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the
391 /// iterator encounters [invalid] data.
392 ///
393 /// [invalid]: Self#Validation
394 pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
395 (0..self.len()).map(|i| self.get(i))
396 }
397
398 /// Iterates over all dictionary entries. When working with [unvalidated] input, consider
399 /// [`Self::iter_try`] to avoid panics due to invalid data.
400 ///
401 /// [unvalidated]: Self#Validation
402 pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
403 self.iter_try()
404 .map(|result| result.expect("Invalid metadata dictionary entry"))
405 }
406}
407
408/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
409/// [unvalidated] input could also panic if the underlying bytes are invalid.
410///
411/// [unvalidated]: Self#Validation
412impl std::ops::Index<usize> for VariantMetadata<'_> {
413 type Output = str;
414
415 fn index(&self, i: usize) -> &str {
416 self.get_impl(i)
417 }
418}
419
420#[cfg(test)]
421mod tests {
422
423 use crate::VariantBuilder;
424
425 use super::*;
426
427 /// `"cat"`, `"dog"` – valid metadata
428 #[test]
429 fn try_new_ok_inline() {
430 let bytes = &[
431 0b0000_0001, // header, offset_size_minus_one=0 and version=1
432 0x02, // dictionary_size (2 strings)
433 0x00,
434 0x03,
435 0x06,
436 b'c',
437 b'a',
438 b't',
439 b'd',
440 b'o',
441 b'g',
442 ];
443
444 let md = VariantMetadata::try_new(bytes).expect("should parse");
445 assert_eq!(md.len(), 2);
446 // Fields
447 assert_eq!(&md[0], "cat");
448 assert_eq!(&md[1], "dog");
449
450 // Offsets
451 assert_eq!(md.get_offset(0).unwrap(), 0x00);
452 assert_eq!(md.get_offset(1).unwrap(), 0x03);
453 assert_eq!(md.get_offset(2).unwrap(), 0x06);
454
455 let err = md.get_offset(3).unwrap_err();
456 assert!(
457 matches!(err, ArrowError::InvalidArgumentError(_)),
458 "unexpected error: {err:?}"
459 );
460
461 let fields: Vec<(usize, &str)> = md.iter().enumerate().collect();
462 assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]);
463 }
464
465 /// Too short buffer test (missing one required offset).
466 /// Should error with "metadata shorter than dictionary_size implies".
467 #[test]
468 fn try_new_missing_last_value() {
469 let bytes = &[
470 0b0000_0001, // header, offset_size_minus_one=0 and version=1
471 0x02, // dictionary_size = 2
472 0x00,
473 0x01,
474 0x02,
475 b'a',
476 b'b', // <-- we'll remove this
477 ];
478
479 let working_md = VariantMetadata::try_new(bytes).expect("should parse");
480 assert_eq!(working_md.len(), 2);
481 assert_eq!(&working_md[0], "a");
482 assert_eq!(&working_md[1], "b");
483
484 let truncated = &bytes[..bytes.len() - 1];
485
486 let err = VariantMetadata::try_new(truncated).unwrap_err();
487 assert!(
488 matches!(err, ArrowError::InvalidArgumentError(_)),
489 "unexpected error: {err:?}"
490 );
491 }
492
493 #[test]
494 fn try_new_fails_non_monotonic() {
495 // 'cat', 'dog', 'lamb'
496 let bytes = &[
497 0b0000_0001, // header, offset_size_minus_one=0 and version=1
498 0x03, // dictionary_size
499 0x00,
500 0x02,
501 0x01, // Doesn't increase monotonically
502 0x10,
503 b'c',
504 b'a',
505 b't',
506 b'd',
507 b'o',
508 b'g',
509 b'l',
510 b'a',
511 b'm',
512 b'b',
513 ];
514
515 let err = VariantMetadata::try_new(bytes).unwrap_err();
516 assert!(
517 matches!(err, ArrowError::InvalidArgumentError(_)),
518 "unexpected error: {err:?}"
519 );
520 }
521
522 #[test]
523 fn try_new_fails_non_monotonic2() {
524 // this test case checks whether offsets are monotonic in the full validation logic.
525
526 // 'cat', 'dog', 'lamb', "eel"
527 let bytes = &[
528 0b0000_0001, // header, offset_size_minus_one=0 and version=1
529 4, // dictionary_size
530 0x00,
531 0x02,
532 0x01, // Doesn't increase monotonically
533 0x10,
534 13,
535 b'c',
536 b'a',
537 b't',
538 b'd',
539 b'o',
540 b'g',
541 b'l',
542 b'a',
543 b'm',
544 b'b',
545 b'e',
546 b'e',
547 b'l',
548 ];
549
550 let err = VariantMetadata::try_new(bytes).unwrap_err();
551
552 assert!(
553 matches!(err, ArrowError::InvalidArgumentError(_)),
554 "unexpected error: {err:?}"
555 );
556 }
557
558 #[test]
559 fn try_new_truncated_offsets_inline() {
560 // Missing final offset
561 let bytes = &[0b0000_0001, 0x02, 0x00, 0x01];
562
563 let err = VariantMetadata::try_new(bytes).unwrap_err();
564 assert!(
565 matches!(err, ArrowError::InvalidArgumentError(_)),
566 "unexpected error: {err:?}"
567 );
568 }
569
570 #[test]
571 fn empty_string_is_valid() {
572 let bytes = &[
573 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
574 1,
575 0x00,
576 0x00,
577 ];
578 let metadata = VariantMetadata::try_new(bytes).unwrap();
579 assert_eq!(&metadata[0], "");
580
581 let bytes = &[
582 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
583 2,
584 0x00,
585 0x00,
586 0x02,
587 b'h',
588 b'i',
589 ];
590 let metadata = VariantMetadata::try_new(bytes).unwrap();
591 assert_eq!(&metadata[0], "");
592 assert_eq!(&metadata[1], "hi");
593
594 let bytes = &[
595 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
596 2,
597 0x00,
598 0x02,
599 0x02, // empty string is allowed, but must be first in a sorted dict
600 b'h',
601 b'i',
602 ];
603 let err = VariantMetadata::try_new(bytes).unwrap_err();
604 assert!(
605 matches!(err, ArrowError::InvalidArgumentError(_)),
606 "unexpected error: {err:?}"
607 );
608 }
609
610 #[test]
611 fn test_compare_sorted_dictionary_with_unsorted_dictionary() {
612 // create a sorted object
613 let mut b = VariantBuilder::new();
614 let mut o = b.new_object();
615
616 o.insert("a", false);
617 o.insert("b", false);
618
619 o.finish();
620
621 let (m, _) = b.finish();
622
623 let m1 = VariantMetadata::new(&m);
624 assert!(m1.is_sorted());
625
626 // Create metadata with an unsorted dictionary (field names are "a", "a", "b")
627 // Since field names are not unique, it is considered not sorted.
628 let metadata_bytes = vec![
629 0b0000_0001,
630 3, // dictionary size
631 0, // "a"
632 1, // "a"
633 2, // "b"
634 3,
635 b'a',
636 b'a',
637 b'b',
638 ];
639 let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap();
640 assert!(!m2.is_sorted());
641
642 assert_ne!(m1, m2);
643 }
644
645 #[test]
646 fn test_compare_sorted_dictionary_with_sorted_dictionary() {
647 // create a sorted object
648 let mut b = VariantBuilder::new();
649 let mut o = b.new_object();
650
651 o.insert("a", false);
652 o.insert("b", false);
653
654 o.finish();
655
656 let (m, _) = b.finish();
657
658 let m1 = VariantMetadata::new(&m);
659 let m2 = VariantMetadata::new(&m);
660
661 assert_eq!(m1, m2);
662 }
663}