parquet_variant/variant/metadata.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{OffsetSizeBytes, map_bytes_to_offsets};
19use crate::utils::{
20 first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
21 try_binary_search_range_by,
22};
23
24use arrow_schema::ArrowError;
25
26/// Header structure for [`VariantMetadata`]
27#[derive(Debug, Clone, Copy, PartialEq)]
28pub(crate) struct VariantMetadataHeader {
29 version: u8,
30 is_sorted: bool,
31 /// Note: This is `offset_size_minus_one` + 1
32 offset_size: OffsetSizeBytes,
33}
34
35// According to the spec this is currently always = 1, and so we store this const for validation
36// purposes and to make that visible.
37const CORRECT_VERSION_VALUE: u8 = 1;
38
39// The metadata header occupies one byte; use a named constant for readability
40const NUM_HEADER_BYTES: u32 = 1;
41
42impl VariantMetadataHeader {
43 // Hide the cast
44 const fn offset_size(&self) -> u32 {
45 self.offset_size as u32
46 }
47
48 // Avoid materializing this offset, since it's cheaply and safely computable
49 const fn first_offset_byte(&self) -> u32 {
50 NUM_HEADER_BYTES + self.offset_size()
51 }
52
53 /// Tries to construct the variant metadata header, which has the form
54 ///
55 /// ```text
56 /// 7 6 5 4 3 0
57 /// +-------+---+---+---------------+
58 /// header | | | | version |
59 /// +-------+---+---+---------------+
60 /// ^ ^
61 /// | +-- sorted_strings
62 /// +-- offset_size_minus_one
63 /// ```
64 ///
65 /// The version is a 4-bit value that must always contain the value 1.
66 /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
67 /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
68 /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
69 pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
70 let version = header_byte & 0x0F; // First four bits
71 if version != CORRECT_VERSION_VALUE {
72 let err_msg = format!(
73 "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {version:b}",
74 );
75 return Err(ArrowError::InvalidArgumentError(err_msg));
76 }
77 let is_sorted = (header_byte & 0x10) != 0; // Fifth bit
78 let offset_size_minus_one = header_byte >> 6; // Last two bits
79 Ok(Self {
80 version,
81 is_sorted,
82 offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
83 })
84 }
85}
86
87/// [`Variant`] Metadata
88///
89/// See the [Variant Spec] file for more information
90///
91/// # Validation
92///
93/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
94/// underlying bytes are a valid encoding of variant metadata (see below).
95///
96/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
97/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
98/// validation cost is linear in the number of underlying bytes.
99///
100/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
101/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
102/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
103/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
104/// _unvalidated_ instance, if desired.
105///
106/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
107/// knows the underlying bytes were already validated previously, or if the caller intends to
108/// perform a small number of (fallible) accesses to a large dictionary.
109///
110/// A _validated_ variant [metadata instance guarantees that:
111///
112/// - header byte is valid
113/// - dictionary size is in bounds
114/// - offset array content is in-bounds
115/// - first offset is zero
116/// - last offset is in-bounds
117/// - all other offsets are in-bounds (*)
118/// - all offsets are monotonically increasing (*)
119/// - all values are valid utf-8 (*)
120///
121/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
122/// in the list above); it panics any of the other checks fails.
123///
124/// # Safety
125///
126/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing
127/// it with infallible methods may cause panics but will never lead to undefined behavior.
128///
129/// [`Variant`]: crate::Variant
130/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
131#[derive(Debug, Clone, PartialEq)]
132pub struct VariantMetadata<'m> {
133 /// (Only) the bytes that make up this metadata instance.
134 pub(crate) bytes: &'m [u8],
135 header: VariantMetadataHeader,
136 dictionary_size: u32,
137 first_value_byte: u32,
138 validated: bool,
139}
140
141// We don't want this to grow because it increases the size of VariantList and VariantObject, which
142// could increase the size of Variant. All those size increases could hurt performance.
143#[cfg(target_pointer_width = "64")]
144const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
145
146#[cfg(target_pointer_width = "32")]
147const _: () = crate::utils::expect_size_of::<VariantMetadata>(20);
148
149/// The canonical byte slice corresponding to an empty metadata dictionary.
150///
151/// ```
152/// # use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, VariantMetadata, WritableMetadataBuilder};
153/// let mut metadata_builder = WritableMetadataBuilder::default();
154/// metadata_builder.finish();
155/// let metadata_bytes = metadata_builder.into_inner();
156/// assert_eq!(&metadata_bytes, EMPTY_VARIANT_METADATA_BYTES);
157/// ```
158pub const EMPTY_VARIANT_METADATA_BYTES: &[u8] = &[1, 0, 0];
159
160/// The empty metadata dictionary.
161///
162/// ```
163/// # use parquet_variant::{EMPTY_VARIANT_METADATA, VariantMetadata, WritableMetadataBuilder};
164/// let mut metadata_builder = WritableMetadataBuilder::default();
165/// metadata_builder.finish();
166/// let metadata_bytes = metadata_builder.into_inner();
167/// let empty_metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
168/// assert_eq!(empty_metadata, EMPTY_VARIANT_METADATA);
169/// ```
170pub const EMPTY_VARIANT_METADATA: VariantMetadata = VariantMetadata {
171 bytes: EMPTY_VARIANT_METADATA_BYTES,
172 header: VariantMetadataHeader {
173 version: CORRECT_VERSION_VALUE,
174 is_sorted: false,
175 offset_size: OffsetSizeBytes::One,
176 },
177 dictionary_size: 0,
178 first_value_byte: 3,
179 validated: true,
180};
181
182impl<'m> VariantMetadata<'m> {
183 /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
184 /// dictionary entries.
185 ///
186 /// [validation]: Self#Validation
187 pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
188 Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
189 }
190
191 /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
192 /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as
193 /// indexing and iteration could also panic if the underlying bytes are invalid.
194 ///
195 /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes
196 /// were already validated previously by other means, or if the caller expects a small number of
197 /// accesses to a large dictionary (preferring to use a small number of fallible accesses as
198 /// needed, instead of paying expensive full validation up front).
199 ///
200 /// [validate]: Self#Validation
201 pub fn new(bytes: &'m [u8]) -> Self {
202 Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
203 }
204
205 // The actual constructor, which performs only basic (constant-const) validation.
206 pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
207 let header_byte = first_byte_from_slice(bytes)?;
208 let header = VariantMetadataHeader::try_new(header_byte)?;
209
210 // First element after header is dictionary size; the offset array immediately follows.
211 let dictionary_size =
212 header
213 .offset_size
214 .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
215
216 // Calculate the starting offset of the dictionary string bytes.
217 //
218 // There are dict_size + 1 offsets, and the value bytes immediately follow
219 // = (dict_size + 1) * offset_size + header.first_offset_byte()
220 let first_value_byte = dictionary_size
221 .checked_add(1)
222 .and_then(|n| n.checked_mul(header.offset_size()))
223 .and_then(|n| n.checked_add(header.first_offset_byte()))
224 .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
225
226 let mut new_self = Self {
227 bytes,
228 header,
229 dictionary_size,
230 first_value_byte,
231 validated: false,
232 };
233
234 // Validate just the first and last offset, ignoring the other offsets and all value bytes.
235 let first_offset = new_self.get_offset(0)?;
236 if first_offset != 0 {
237 return Err(ArrowError::InvalidArgumentError(format!(
238 "First offset is not zero: {first_offset}"
239 )));
240 }
241
242 // Use the last offset to upper-bound the byte slice
243 let last_offset = new_self
244 .get_offset(dictionary_size as _)?
245 .checked_add(first_value_byte)
246 .ok_or_else(|| overflow_error("variant metadata size"))?;
247 new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
248 Ok(new_self)
249 }
250
251 /// The number of metadata dictionary entries
252 pub fn len(&self) -> usize {
253 self.dictionary_size as _
254 }
255
256 /// True if this metadata dictionary contains no entries
257 pub fn is_empty(&self) -> bool {
258 self.len() == 0
259 }
260
261 /// True if this instance is fully [validated] for panic-free infallible accesses.
262 ///
263 /// [validated]: Self#Validation
264 pub fn is_fully_validated(&self) -> bool {
265 self.validated
266 }
267
268 /// Performs a full [validation] of this metadata dictionary and returns the result.
269 ///
270 /// [validation]: Self#Validation
271 pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
272 if !self.validated {
273 let offset_bytes = slice_from_slice(
274 self.bytes,
275 self.header.first_offset_byte() as _..self.first_value_byte as _,
276 )?;
277
278 // Verify the string values in the dictionary are UTF-8 encoded strings.
279 let value_buffer =
280 string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
281
282 let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
283
284 if self.header.is_sorted {
285 // Validate the dictionary values are unique and lexicographically sorted
286 //
287 // Since we use the offsets to access dictionary values, this also validates
288 // offsets are in-bounds and monotonically increasing
289 let mut current_offset = offsets.next().unwrap_or(0);
290 let mut prev_value: Option<&str> = None;
291 for next_offset in offsets {
292 let current_value = value_buffer.get(current_offset..next_offset).ok_or_else(
293 || {
294 ArrowError::InvalidArgumentError(format!(
295 "range {current_offset}..{next_offset} is invalid or out of bounds"
296 ))
297 },
298 )?;
299
300 if let Some(prev_val) = prev_value {
301 if current_value <= prev_val {
302 return Err(ArrowError::InvalidArgumentError(
303 "dictionary values are not unique and ordered".to_string(),
304 ));
305 }
306 }
307
308 prev_value = Some(current_value);
309 current_offset = next_offset;
310 }
311 } else {
312 // Validate offsets are in-bounds and monotonically increasing
313 //
314 // Since shallow validation ensures the first and last offsets are in bounds,
315 // we can also verify all offsets are in-bounds by checking if
316 // offsets are monotonically increasing
317 if !offsets.is_sorted_by(|a, b| a < b) {
318 return Err(ArrowError::InvalidArgumentError(
319 "offsets not monotonically increasing".to_string(),
320 ));
321 }
322 }
323
324 self.validated = true;
325 }
326 Ok(self)
327 }
328
329 /// Whether the dictionary keys are sorted and unique
330 pub fn is_sorted(&self) -> bool {
331 self.header.is_sorted
332 }
333
334 /// The variant protocol version
335 pub const fn version(&self) -> u8 {
336 self.header.version
337 }
338
339 /// Gets an offset into the dictionary entry by index.
340 ///
341 /// This offset is an index into the dictionary, at the boundary between string `i-1` and string
342 /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
343 fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
344 let offset_byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
345 let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
346 self.header.offset_size.unpack_u32(bytes, i)
347 }
348
349 /// Returns the total size, in bytes, of the metadata.
350 ///
351 /// Note this value may be smaller than what was passed to [`Self::new`] or
352 /// [`Self::try_new`] if the input was larger than necessary to encode the
353 /// metadata dictionary.
354 pub fn size(&self) -> usize {
355 self.bytes.len()
356 }
357
358 /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
359 /// underlying bytes are [invalid].
360 ///
361 /// [invalid]: Self#Validation
362 pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
363 let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as _;
364 string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
365 }
366
367 // Helper method used by our `impl Index` and also by `get_entry`. Panics if the underlying
368 // bytes are invalid. Needed because the `Index` trait forces the returned result to have the
369 // lifetime of `self` instead of the string's own (longer) lifetime `'m`.
370 fn get_impl(&self, i: usize) -> &'m str {
371 self.get(i).expect("Invalid metadata dictionary entry")
372 }
373
374 /// Attempts to retrieve a dictionary entry and its field id, returning None if the requested field
375 /// name is not present. The search cost is logarithmic if [`Self::is_sorted`] and linear
376 /// otherwise.
377 ///
378 /// WARNING: This method panics if the underlying bytes are [invalid].
379 ///
380 /// [invalid]: Self#Validation
381 pub fn get_entry(&self, field_name: &str) -> Option<(u32, &'m str)> {
382 let field_id = if self.is_sorted() && self.len() > 10 {
383 // Binary search is faster for a not-tiny sorted metadata dictionary
384 let cmp = |i| Some(self.get_impl(i).cmp(field_name));
385 try_binary_search_range_by(0..self.len(), cmp)?.ok()?
386 } else {
387 // Fall back to Linear search for tiny or unsorted dictionary
388 (0..self.len()).find(|i| self.get_impl(*i) == field_name)?
389 };
390 Some((field_id as u32, self.get_impl(field_id)))
391 }
392
393 /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the
394 /// iterator encounters [invalid] data.
395 ///
396 /// [invalid]: Self#Validation
397 pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
398 (0..self.len()).map(|i| self.get(i))
399 }
400
401 /// Iterates over all dictionary entries. When working with [unvalidated] input, consider
402 /// [`Self::iter_try`] to avoid panics due to invalid data.
403 ///
404 /// [unvalidated]: Self#Validation
405 pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
406 self.iter_try()
407 .map(|result| result.expect("Invalid metadata dictionary entry"))
408 }
409}
410
411/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
412/// [unvalidated] input could also panic if the underlying bytes are invalid.
413///
414/// [unvalidated]: Self#Validation
415impl std::ops::Index<usize> for VariantMetadata<'_> {
416 type Output = str;
417
418 fn index(&self, i: usize) -> &str {
419 self.get_impl(i)
420 }
421}
422
423#[cfg(test)]
424mod tests {
425
426 use crate::VariantBuilder;
427
428 use super::*;
429
430 /// `"cat"`, `"dog"` – valid metadata
431 #[test]
432 fn try_new_ok_inline() {
433 let bytes = &[
434 0b0000_0001, // header, offset_size_minus_one=0 and version=1
435 0x02, // dictionary_size (2 strings)
436 0x00,
437 0x03,
438 0x06,
439 b'c',
440 b'a',
441 b't',
442 b'd',
443 b'o',
444 b'g',
445 ];
446
447 let md = VariantMetadata::try_new(bytes).expect("should parse");
448 assert_eq!(md.len(), 2);
449 // Fields
450 assert_eq!(&md[0], "cat");
451 assert_eq!(&md[1], "dog");
452
453 // Offsets
454 assert_eq!(md.get_offset(0).unwrap(), 0x00);
455 assert_eq!(md.get_offset(1).unwrap(), 0x03);
456 assert_eq!(md.get_offset(2).unwrap(), 0x06);
457
458 let err = md.get_offset(3).unwrap_err();
459 assert!(
460 matches!(err, ArrowError::InvalidArgumentError(_)),
461 "unexpected error: {err:?}"
462 );
463
464 let fields: Vec<(usize, &str)> = md.iter().enumerate().collect();
465 assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]);
466 }
467
468 /// Too short buffer test (missing one required offset).
469 /// Should error with "metadata shorter than dictionary_size implies".
470 #[test]
471 fn try_new_missing_last_value() {
472 let bytes = &[
473 0b0000_0001, // header, offset_size_minus_one=0 and version=1
474 0x02, // dictionary_size = 2
475 0x00,
476 0x01,
477 0x02,
478 b'a',
479 b'b', // <-- we'll remove this
480 ];
481
482 let working_md = VariantMetadata::try_new(bytes).expect("should parse");
483 assert_eq!(working_md.len(), 2);
484 assert_eq!(&working_md[0], "a");
485 assert_eq!(&working_md[1], "b");
486
487 let truncated = &bytes[..bytes.len() - 1];
488
489 let err = VariantMetadata::try_new(truncated).unwrap_err();
490 assert!(
491 matches!(err, ArrowError::InvalidArgumentError(_)),
492 "unexpected error: {err:?}"
493 );
494 }
495
496 #[test]
497 fn try_new_fails_non_monotonic() {
498 // 'cat', 'dog', 'lamb'
499 let bytes = &[
500 0b0000_0001, // header, offset_size_minus_one=0 and version=1
501 0x03, // dictionary_size
502 0x00,
503 0x02,
504 0x01, // Doesn't increase monotonically
505 0x10,
506 b'c',
507 b'a',
508 b't',
509 b'd',
510 b'o',
511 b'g',
512 b'l',
513 b'a',
514 b'm',
515 b'b',
516 ];
517
518 let err = VariantMetadata::try_new(bytes).unwrap_err();
519 assert!(
520 matches!(err, ArrowError::InvalidArgumentError(_)),
521 "unexpected error: {err:?}"
522 );
523 }
524
525 #[test]
526 fn try_new_fails_non_monotonic2() {
527 // this test case checks whether offsets are monotonic in the full validation logic.
528
529 // 'cat', 'dog', 'lamb', "eel"
530 let bytes = &[
531 0b0000_0001, // header, offset_size_minus_one=0 and version=1
532 4, // dictionary_size
533 0x00,
534 0x02,
535 0x01, // Doesn't increase monotonically
536 0x10,
537 13,
538 b'c',
539 b'a',
540 b't',
541 b'd',
542 b'o',
543 b'g',
544 b'l',
545 b'a',
546 b'm',
547 b'b',
548 b'e',
549 b'e',
550 b'l',
551 ];
552
553 let err = VariantMetadata::try_new(bytes).unwrap_err();
554
555 assert!(
556 matches!(err, ArrowError::InvalidArgumentError(_)),
557 "unexpected error: {err:?}"
558 );
559 }
560
561 #[test]
562 fn try_new_truncated_offsets_inline() {
563 // Missing final offset
564 let bytes = &[0b0000_0001, 0x02, 0x00, 0x01];
565
566 let err = VariantMetadata::try_new(bytes).unwrap_err();
567 assert!(
568 matches!(err, ArrowError::InvalidArgumentError(_)),
569 "unexpected error: {err:?}"
570 );
571 }
572
573 #[test]
574 fn empty_string_is_valid() {
575 let bytes = &[
576 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
577 1,
578 0x00,
579 0x00,
580 ];
581 let metadata = VariantMetadata::try_new(bytes).unwrap();
582 assert_eq!(&metadata[0], "");
583
584 let bytes = &[
585 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
586 2,
587 0x00,
588 0x00,
589 0x02,
590 b'h',
591 b'i',
592 ];
593 let metadata = VariantMetadata::try_new(bytes).unwrap();
594 assert_eq!(&metadata[0], "");
595 assert_eq!(&metadata[1], "hi");
596
597 let bytes = &[
598 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
599 2,
600 0x00,
601 0x02,
602 0x02, // empty string is allowed, but must be first in a sorted dict
603 b'h',
604 b'i',
605 ];
606 let err = VariantMetadata::try_new(bytes).unwrap_err();
607 assert!(
608 matches!(err, ArrowError::InvalidArgumentError(_)),
609 "unexpected error: {err:?}"
610 );
611 }
612
613 #[test]
614 fn test_compare_sorted_dictionary_with_unsorted_dictionary() {
615 // create a sorted object
616 let mut b = VariantBuilder::new();
617 let mut o = b.new_object();
618
619 o.insert("a", false);
620 o.insert("b", false);
621
622 o.finish();
623
624 let (m, _) = b.finish();
625
626 let m1 = VariantMetadata::new(&m);
627 assert!(m1.is_sorted());
628
629 // Create metadata with an unsorted dictionary (field names are "a", "a", "b")
630 // Since field names are not unique, it is considered not sorted.
631 let metadata_bytes = vec![
632 0b0000_0001,
633 3, // dictionary size
634 0, // "a"
635 1, // "a"
636 2, // "b"
637 3,
638 b'a',
639 b'a',
640 b'b',
641 ];
642 let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap();
643 assert!(!m2.is_sorted());
644
645 assert_ne!(m1, m2);
646 }
647
648 #[test]
649 fn test_compare_sorted_dictionary_with_sorted_dictionary() {
650 // create a sorted object
651 let mut b = VariantBuilder::new();
652 let mut o = b.new_object();
653
654 o.insert("a", false);
655 o.insert("b", false);
656
657 o.finish();
658
659 let (m, _) = b.finish();
660
661 let m1 = VariantMetadata::new(&m);
662 let m2 = VariantMetadata::new(&m);
663
664 assert_eq!(m1, m2);
665 }
666}