parquet_variant/variant/metadata.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
19use crate::utils::{
20 first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
21 try_binary_search_range_by,
22};
23
24use arrow_schema::ArrowError;
25
26/// Header structure for [`VariantMetadata`]
27#[derive(Debug, Clone, Copy, PartialEq)]
28pub(crate) struct VariantMetadataHeader {
29 version: u8,
30 is_sorted: bool,
31 /// Note: This is `offset_size_minus_one` + 1
32 offset_size: OffsetSizeBytes,
33}
34
35// According to the spec this is currently always = 1, and so we store this const for validation
36// purposes and to make that visible.
37const CORRECT_VERSION_VALUE: u8 = 1;
38
39// The metadata header occupies one byte; use a named constant for readability
40const NUM_HEADER_BYTES: u32 = 1;
41
42impl VariantMetadataHeader {
43 // Hide the cast
44 const fn offset_size(&self) -> u32 {
45 self.offset_size as u32
46 }
47
48 // Avoid materializing this offset, since it's cheaply and safely computable
49 const fn first_offset_byte(&self) -> u32 {
50 NUM_HEADER_BYTES + self.offset_size()
51 }
52
53 /// Tries to construct the variant metadata header, which has the form
54 ///
55 /// ```text
56 /// 7 6 5 4 3 0
57 /// +-------+---+---+---------------+
58 /// header | | | | version |
59 /// +-------+---+---+---------------+
60 /// ^ ^
61 /// | +-- sorted_strings
62 /// +-- offset_size_minus_one
63 /// ```
64 ///
65 /// The version is a 4-bit value that must always contain the value 1.
66 /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
67 /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
68 /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
69 pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
70 let version = header_byte & 0x0F; // First four bits
71 if version != CORRECT_VERSION_VALUE {
72 let err_msg = format!(
73 "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {version:b}",
74 );
75 return Err(ArrowError::InvalidArgumentError(err_msg));
76 }
77 let is_sorted = (header_byte & 0x10) != 0; // Fifth bit
78 let offset_size_minus_one = header_byte >> 6; // Last two bits
79 Ok(Self {
80 version,
81 is_sorted,
82 offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
83 })
84 }
85}
86
87/// [`Variant`] Metadata
88///
89/// See the [Variant Spec] file for more information
90///
91/// # Validation
92///
93/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
94/// underlying bytes are a valid encoding of variant metadata (see below).
95///
96/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
97/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
98/// validation cost is linear in the number of underlying bytes.
99///
100/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
101/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
102/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
103/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
104/// _unvalidated_ instance, if desired.
105///
106/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
107/// knows the underlying bytes were already validated previously, or if the caller intends to
108/// perform a small number of (fallible) accesses to a large dictionary.
109///
110/// A _validated_ variant [metadata instance guarantees that:
111///
112/// - header byte is valid
113/// - dictionary size is in bounds
114/// - offset array content is in-bounds
115/// - first offset is zero
116/// - last offset is in-bounds
117/// - all other offsets are in-bounds (*)
118/// - all offsets are monotonically increasing (*)
119/// - all values are valid utf-8 (*)
120///
121/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
122/// in the list above); it panics any of the other checks fails.
123///
124/// # Safety
125///
126/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing
127/// it with infallible methods may cause panics but will never lead to undefined behavior.
128///
129/// [`Variant`]: crate::Variant
130/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
131#[derive(Debug, Clone, PartialEq)]
132pub struct VariantMetadata<'m> {
133 pub(crate) bytes: &'m [u8],
134 header: VariantMetadataHeader,
135 dictionary_size: u32,
136 first_value_byte: u32,
137 validated: bool,
138}
139
140// We don't want this to grow because it increases the size of VariantList and VariantObject, which
141// could increase the size of Variant. All those size increases could hurt performance.
142const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
143
144impl<'m> VariantMetadata<'m> {
145 /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
146 /// dictionary entries.
147 ///
148 /// [validation]: Self#Validation
149 pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
150 Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
151 }
152
153 /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
154 /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as
155 /// indexing and iteration could also panic if the underlying bytes are invalid.
156 ///
157 /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes
158 /// were already validated previously by other means, or if the caller expects a small number of
159 /// accesses to a large dictionary (preferring to use a small number of fallible accesses as
160 /// needed, instead of paying expensive full validation up front).
161 ///
162 /// [validate]: Self#Validation
163 pub fn new(bytes: &'m [u8]) -> Self {
164 Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
165 }
166
167 // The actual constructor, which performs only basic (constant-const) validation.
168 pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
169 let header_byte = first_byte_from_slice(bytes)?;
170 let header = VariantMetadataHeader::try_new(header_byte)?;
171
172 // First element after header is dictionary size; the offset array immediately follows.
173 let dictionary_size =
174 header
175 .offset_size
176 .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
177
178 // Calculate the starting offset of the dictionary string bytes.
179 //
180 // There are dict_size + 1 offsets, and the value bytes immediately follow
181 // = (dict_size + 1) * offset_size + header.first_offset_byte()
182 let first_value_byte = dictionary_size
183 .checked_add(1)
184 .and_then(|n| n.checked_mul(header.offset_size()))
185 .and_then(|n| n.checked_add(header.first_offset_byte()))
186 .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
187
188 let mut new_self = Self {
189 bytes,
190 header,
191 dictionary_size,
192 first_value_byte,
193 validated: false,
194 };
195
196 // Validate just the first and last offset, ignoring the other offsets and all value bytes.
197 let first_offset = new_self.get_offset(0)?;
198 if first_offset != 0 {
199 return Err(ArrowError::InvalidArgumentError(format!(
200 "First offset is not zero: {first_offset}"
201 )));
202 }
203
204 // Use the last offset to upper-bound the byte slice
205 let last_offset = new_self
206 .get_offset(dictionary_size as _)?
207 .checked_add(first_value_byte)
208 .ok_or_else(|| overflow_error("variant metadata size"))?;
209 new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
210 Ok(new_self)
211 }
212
213 /// The number of metadata dictionary entries
214 pub fn len(&self) -> usize {
215 self.dictionary_size as _
216 }
217
218 /// True if this metadata dictionary contains no entries
219 pub fn is_empty(&self) -> bool {
220 self.len() == 0
221 }
222
223 /// True if this instance is fully [validated] for panic-free infallible accesses.
224 ///
225 /// [validated]: Self#Validation
226 pub fn is_fully_validated(&self) -> bool {
227 self.validated
228 }
229
230 /// Performs a full [validation] of this metadata dictionary and returns the result.
231 ///
232 /// [validation]: Self#Validation
233 pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
234 if !self.validated {
235 let offset_bytes = slice_from_slice(
236 self.bytes,
237 self.header.first_offset_byte() as _..self.first_value_byte as _,
238 )?;
239
240 // Verify the string values in the dictionary are UTF-8 encoded strings.
241 let value_buffer =
242 string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
243
244 let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
245
246 if self.header.is_sorted {
247 // Validate the dictionary values are unique and lexicographically sorted
248 //
249 // Since we use the offsets to access dictionary values, this also validates
250 // offsets are in-bounds and monotonically increasing
251 let mut current_offset = offsets.next().unwrap_or(0);
252 let mut prev_value: Option<&str> = None;
253 for next_offset in offsets {
254 let current_value =
255 value_buffer
256 .get(current_offset..next_offset)
257 .ok_or_else(|| {
258 ArrowError::InvalidArgumentError(format!(
259 "range {current_offset}..{next_offset} is invalid or out of bounds"
260 ))
261 })?;
262
263 if let Some(prev_val) = prev_value {
264 if current_value <= prev_val {
265 return Err(ArrowError::InvalidArgumentError(
266 "dictionary values are not unique and ordered".to_string(),
267 ));
268 }
269 }
270
271 prev_value = Some(current_value);
272 current_offset = next_offset;
273 }
274 } else {
275 // Validate offsets are in-bounds and monotonically increasing
276 //
277 // Since shallow validation ensures the first and last offsets are in bounds,
278 // we can also verify all offsets are in-bounds by checking if
279 // offsets are monotonically increasing
280 if !offsets.is_sorted_by(|a, b| a < b) {
281 return Err(ArrowError::InvalidArgumentError(
282 "offsets not monotonically increasing".to_string(),
283 ));
284 }
285 }
286
287 self.validated = true;
288 }
289 Ok(self)
290 }
291
292 /// Whether the dictionary keys are sorted and unique
293 pub fn is_sorted(&self) -> bool {
294 self.header.is_sorted
295 }
296
297 /// The variant protocol version
298 pub const fn version(&self) -> u8 {
299 self.header.version
300 }
301
302 /// Gets an offset array entry by index.
303 ///
304 /// This offset is an index into the dictionary, at the boundary between string `i-1` and string
305 /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
306 fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
307 let offset_byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
308 let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
309 self.header.offset_size.unpack_u32(bytes, i)
310 }
311
312 /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
313 /// underlying bytes are [invalid].
314 ///
315 /// [invalid]: Self#Validation
316 pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
317 let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as _;
318 string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
319 }
320
321 // Helper method used by our `impl Index` and also by `get_entry`. Panics if the underlying
322 // bytes are invalid. Needed because the `Index` trait forces the returned result to have the
323 // lifetime of `self` instead of the string's own (longer) lifetime `'m`.
324 fn get_impl(&self, i: usize) -> &'m str {
325 self.get(i).expect("Invalid metadata dictionary entry")
326 }
327
328 /// Attempts to retrieve a dictionary entry and its field id, returning None if the requested field
329 /// name is not present. The search cost is logarithmic if [`Self::is_sorted`] and linear
330 /// otherwise.
331 ///
332 /// WARNING: This method panics if the underlying bytes are [invalid].
333 ///
334 /// [invalid]: Self#Validation
335 pub fn get_entry(&self, field_name: &str) -> Option<(u32, &'m str)> {
336 let field_id = if self.is_sorted() && self.len() > 10 {
337 // Binary search is faster for a not-tiny sorted metadata dictionary
338 let cmp = |i| Some(self.get_impl(i).cmp(field_name));
339 try_binary_search_range_by(0..self.len(), cmp)?.ok()?
340 } else {
341 // Fall back to Linear search for tiny or unsorted dictionary
342 (0..self.len()).find(|i| self.get_impl(*i) == field_name)?
343 };
344 Some((field_id as u32, self.get_impl(field_id)))
345 }
346
347 /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the
348 /// iterator encounters [invalid] data.
349 ///
350 /// [invalid]: Self#Validation
351 pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
352 (0..self.len()).map(|i| self.get(i))
353 }
354
355 /// Iterates over all dictionary entries. When working with [unvalidated] input, consider
356 /// [`Self::iter_try`] to avoid panics due to invalid data.
357 ///
358 /// [unvalidated]: Self#Validation
359 pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
360 self.iter_try()
361 .map(|result| result.expect("Invalid metadata dictionary entry"))
362 }
363}
364
365/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
366/// [unvalidated] input could also panic if the underlying bytes are invalid.
367///
368/// [unvalidated]: Self#Validation
369impl std::ops::Index<usize> for VariantMetadata<'_> {
370 type Output = str;
371
372 fn index(&self, i: usize) -> &str {
373 self.get_impl(i)
374 }
375}
376
377#[cfg(test)]
378mod tests {
379
380 use crate::VariantBuilder;
381
382 use super::*;
383
384 /// `"cat"`, `"dog"` – valid metadata
385 #[test]
386 fn try_new_ok_inline() {
387 let bytes = &[
388 0b0000_0001, // header, offset_size_minus_one=0 and version=1
389 0x02, // dictionary_size (2 strings)
390 0x00,
391 0x03,
392 0x06,
393 b'c',
394 b'a',
395 b't',
396 b'd',
397 b'o',
398 b'g',
399 ];
400
401 let md = VariantMetadata::try_new(bytes).expect("should parse");
402 assert_eq!(md.len(), 2);
403 // Fields
404 assert_eq!(&md[0], "cat");
405 assert_eq!(&md[1], "dog");
406
407 // Offsets
408 assert_eq!(md.get_offset(0).unwrap(), 0x00);
409 assert_eq!(md.get_offset(1).unwrap(), 0x03);
410 assert_eq!(md.get_offset(2).unwrap(), 0x06);
411
412 let err = md.get_offset(3).unwrap_err();
413 assert!(
414 matches!(err, ArrowError::InvalidArgumentError(_)),
415 "unexpected error: {err:?}"
416 );
417
418 let fields: Vec<(usize, &str)> = md.iter().enumerate().collect();
419 assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]);
420 }
421
422 /// Too short buffer test (missing one required offset).
423 /// Should error with "metadata shorter than dictionary_size implies".
424 #[test]
425 fn try_new_missing_last_value() {
426 let bytes = &[
427 0b0000_0001, // header, offset_size_minus_one=0 and version=1
428 0x02, // dictionary_size = 2
429 0x00,
430 0x01,
431 0x02,
432 b'a',
433 b'b', // <-- we'll remove this
434 ];
435
436 let working_md = VariantMetadata::try_new(bytes).expect("should parse");
437 assert_eq!(working_md.len(), 2);
438 assert_eq!(&working_md[0], "a");
439 assert_eq!(&working_md[1], "b");
440
441 let truncated = &bytes[..bytes.len() - 1];
442
443 let err = VariantMetadata::try_new(truncated).unwrap_err();
444 assert!(
445 matches!(err, ArrowError::InvalidArgumentError(_)),
446 "unexpected error: {err:?}"
447 );
448 }
449
450 #[test]
451 fn try_new_fails_non_monotonic() {
452 // 'cat', 'dog', 'lamb'
453 let bytes = &[
454 0b0000_0001, // header, offset_size_minus_one=0 and version=1
455 0x03, // dictionary_size
456 0x00,
457 0x02,
458 0x01, // Doesn't increase monotonically
459 0x10,
460 b'c',
461 b'a',
462 b't',
463 b'd',
464 b'o',
465 b'g',
466 b'l',
467 b'a',
468 b'm',
469 b'b',
470 ];
471
472 let err = VariantMetadata::try_new(bytes).unwrap_err();
473 assert!(
474 matches!(err, ArrowError::InvalidArgumentError(_)),
475 "unexpected error: {err:?}"
476 );
477 }
478
479 #[test]
480 fn try_new_fails_non_monotonic2() {
481 // this test case checks whether offsets are monotonic in the full validation logic.
482
483 // 'cat', 'dog', 'lamb', "eel"
484 let bytes = &[
485 0b0000_0001, // header, offset_size_minus_one=0 and version=1
486 4, // dictionary_size
487 0x00,
488 0x02,
489 0x01, // Doesn't increase monotonically
490 0x10,
491 13,
492 b'c',
493 b'a',
494 b't',
495 b'd',
496 b'o',
497 b'g',
498 b'l',
499 b'a',
500 b'm',
501 b'b',
502 b'e',
503 b'e',
504 b'l',
505 ];
506
507 let err = VariantMetadata::try_new(bytes).unwrap_err();
508
509 assert!(
510 matches!(err, ArrowError::InvalidArgumentError(_)),
511 "unexpected error: {err:?}"
512 );
513 }
514
515 #[test]
516 fn try_new_truncated_offsets_inline() {
517 // Missing final offset
518 let bytes = &[0b0000_0001, 0x02, 0x00, 0x01];
519
520 let err = VariantMetadata::try_new(bytes).unwrap_err();
521 assert!(
522 matches!(err, ArrowError::InvalidArgumentError(_)),
523 "unexpected error: {err:?}"
524 );
525 }
526
527 #[test]
528 fn empty_string_is_valid() {
529 let bytes = &[
530 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
531 1,
532 0x00,
533 0x00,
534 ];
535 let metadata = VariantMetadata::try_new(bytes).unwrap();
536 assert_eq!(&metadata[0], "");
537
538 let bytes = &[
539 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
540 2,
541 0x00,
542 0x00,
543 0x02,
544 b'h',
545 b'i',
546 ];
547 let metadata = VariantMetadata::try_new(bytes).unwrap();
548 assert_eq!(&metadata[0], "");
549 assert_eq!(&metadata[1], "hi");
550
551 let bytes = &[
552 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
553 2,
554 0x00,
555 0x02,
556 0x02, // empty string is allowed, but must be first in a sorted dict
557 b'h',
558 b'i',
559 ];
560 let err = VariantMetadata::try_new(bytes).unwrap_err();
561 assert!(
562 matches!(err, ArrowError::InvalidArgumentError(_)),
563 "unexpected error: {err:?}"
564 );
565 }
566
567 #[test]
568 fn test_compare_sorted_dictionary_with_unsorted_dictionary() {
569 // create a sorted object
570 let mut b = VariantBuilder::new();
571 let mut o = b.new_object();
572
573 o.insert("a", false);
574 o.insert("b", false);
575
576 o.finish();
577
578 let (m, _) = b.finish();
579
580 let m1 = VariantMetadata::new(&m);
581 assert!(m1.is_sorted());
582
583 // Create metadata with an unsorted dictionary (field names are "a", "a", "b")
584 // Since field names are not unique, it is considered not sorted.
585 let metadata_bytes = vec![
586 0b0000_0001,
587 3, // dictionary size
588 0, // "a"
589 1, // "a"
590 2, // "b"
591 3,
592 b'a',
593 b'a',
594 b'b',
595 ];
596 let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap();
597 assert!(!m2.is_sorted());
598
599 assert_ne!(m1, m2);
600 }
601
602 #[test]
603 fn test_compare_sorted_dictionary_with_sorted_dictionary() {
604 // create a sorted object
605 let mut b = VariantBuilder::new();
606 let mut o = b.new_object();
607
608 o.insert("a", false);
609 o.insert("b", false);
610
611 o.finish();
612
613 let (m, _) = b.finish();
614
615 let m1 = VariantMetadata::new(&m);
616 let m2 = VariantMetadata::new(&m);
617
618 assert_eq!(m1, m2);
619 }
620}