parquet_variant/variant/metadata.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
19use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice};
20
21use arrow_schema::ArrowError;
22
23/// Header structure for [`VariantMetadata`]
24#[derive(Debug, Clone, Copy, PartialEq)]
25pub(crate) struct VariantMetadataHeader {
26 version: u8,
27 is_sorted: bool,
28 /// Note: This is `offset_size_minus_one` + 1
29 offset_size: OffsetSizeBytes,
30}
31
32// According to the spec this is currently always = 1, and so we store this const for validation
33// purposes and to make that visible.
34const CORRECT_VERSION_VALUE: u8 = 1;
35
36// The metadata header occupies one byte; use a named constant for readability
37const NUM_HEADER_BYTES: u32 = 1;
38
39impl VariantMetadataHeader {
40 // Hide the cast
41 const fn offset_size(&self) -> u32 {
42 self.offset_size as u32
43 }
44
45 // Avoid materializing this offset, since it's cheaply and safely computable
46 const fn first_offset_byte(&self) -> u32 {
47 NUM_HEADER_BYTES + self.offset_size()
48 }
49
50 /// Tries to construct the variant metadata header, which has the form
51 ///
52 /// ```text
53 /// 7 6 5 4 3 0
54 /// +-------+---+---+---------------+
55 /// header | | | | version |
56 /// +-------+---+---+---------------+
57 /// ^ ^
58 /// | +-- sorted_strings
59 /// +-- offset_size_minus_one
60 /// ```
61 ///
62 /// The version is a 4-bit value that must always contain the value 1.
63 /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
64 /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
65 /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
66 pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
67 let version = header_byte & 0x0F; // First four bits
68 if version != CORRECT_VERSION_VALUE {
69 let err_msg = format!(
70 "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {version:b}",
71 );
72 return Err(ArrowError::InvalidArgumentError(err_msg));
73 }
74 let is_sorted = (header_byte & 0x10) != 0; // Fifth bit
75 let offset_size_minus_one = header_byte >> 6; // Last two bits
76 Ok(Self {
77 version,
78 is_sorted,
79 offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
80 })
81 }
82}
83
84/// [`Variant`] Metadata
85///
86/// See the [Variant Spec] file for more information
87///
88/// # Validation
89///
90/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
91/// underlying bytes are a valid encoding of variant metadata (see below).
92///
93/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
94/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
95/// validation cost is linear in the number of underlying bytes.
96///
97/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
98/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
99/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
100/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
101/// _unvalidated_ instance, if desired.
102///
103/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
104/// knows the underlying bytes were already validated previously, or if the caller intends to
105/// perform a small number of (fallible) accesses to a large dictionary.
106///
107/// A _validated_ variant [metadata instance guarantees that:
108///
109/// - header byte is valid
110/// - dictionary size is in bounds
111/// - offset array content is in-bounds
112/// - first offset is zero
113/// - last offset is in-bounds
114/// - all other offsets are in-bounds (*)
115/// - all offsets are monotonically increasing (*)
116/// - all values are valid utf-8 (*)
117///
118/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
119/// in the list above); it panics any of the other checks fails.
120///
121/// # Safety
122///
123/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing
124/// it with infallible methods may cause panics but will never lead to undefined behavior.
125///
126/// [`Variant`]: crate::Variant
127/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
128#[derive(Debug, Clone, PartialEq)]
129pub struct VariantMetadata<'m> {
130 pub(crate) bytes: &'m [u8],
131 header: VariantMetadataHeader,
132 dictionary_size: u32,
133 first_value_byte: u32,
134 validated: bool,
135}
136
137// We don't want this to grow because it increases the size of VariantList and VariantObject, which
138// could increase the size of Variant. All those size increases could hurt performance.
139const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
140
141impl<'m> VariantMetadata<'m> {
142 /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
143 /// dictionary entries.
144 ///
145 /// [validation]: Self#Validation
146 pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
147 Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
148 }
149
150 /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
151 /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as
152 /// indexing and iteration could also panic if the underlying bytes are invalid.
153 ///
154 /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes
155 /// were already validated previously by other means, or if the caller expects a small number of
156 /// accesses to a large dictionary (preferring to use a small number of fallible accesses as
157 /// needed, instead of paying expensive full validation up front).
158 ///
159 /// [validate]: Self#Validation
160 pub fn new(bytes: &'m [u8]) -> Self {
161 Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
162 }
163
164 // The actual constructor, which performs only basic (constant-const) validation.
165 pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
166 let header_byte = first_byte_from_slice(bytes)?;
167 let header = VariantMetadataHeader::try_new(header_byte)?;
168
169 // First element after header is dictionary size; the offset array immediately follows.
170 let dictionary_size =
171 header
172 .offset_size
173 .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
174
175 // Calculate the starting offset of the dictionary string bytes.
176 //
177 // There are dict_size + 1 offsets, and the value bytes immediately follow
178 // = (dict_size + 1) * offset_size + header.first_offset_byte()
179 let first_value_byte = dictionary_size
180 .checked_add(1)
181 .and_then(|n| n.checked_mul(header.offset_size()))
182 .and_then(|n| n.checked_add(header.first_offset_byte()))
183 .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
184
185 let mut new_self = Self {
186 bytes,
187 header,
188 dictionary_size,
189 first_value_byte,
190 validated: false,
191 };
192
193 // Validate just the first and last offset, ignoring the other offsets and all value bytes.
194 let first_offset = new_self.get_offset(0)?;
195 if first_offset != 0 {
196 return Err(ArrowError::InvalidArgumentError(format!(
197 "First offset is not zero: {first_offset}"
198 )));
199 }
200
201 // Use the last offset to upper-bound the byte slice
202 let last_offset = new_self
203 .get_offset(dictionary_size as _)?
204 .checked_add(first_value_byte)
205 .ok_or_else(|| overflow_error("variant metadata size"))?;
206 new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
207 Ok(new_self)
208 }
209
210 /// The number of metadata dictionary entries
211 pub fn len(&self) -> usize {
212 self.dictionary_size as _
213 }
214
215 /// True if this metadata dictionary contains no entries
216 pub fn is_empty(&self) -> bool {
217 self.len() == 0
218 }
219
220 /// True if this instance is fully [validated] for panic-free infallible accesses.
221 ///
222 /// [validated]: Self#Validation
223 pub fn is_fully_validated(&self) -> bool {
224 self.validated
225 }
226
227 /// Performs a full [validation] of this metadata dictionary and returns the result.
228 ///
229 /// [validation]: Self#Validation
230 pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
231 if !self.validated {
232 let offset_bytes = slice_from_slice(
233 self.bytes,
234 self.header.first_offset_byte() as _..self.first_value_byte as _,
235 )?;
236
237 // Verify the string values in the dictionary are UTF-8 encoded strings.
238 let value_buffer =
239 string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
240
241 let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
242
243 if self.header.is_sorted {
244 // Validate the dictionary values are unique and lexicographically sorted
245 //
246 // Since we use the offsets to access dictionary values, this also validates
247 // offsets are in-bounds and monotonically increasing
248 let mut current_offset = offsets.next().unwrap_or(0);
249 let mut prev_value: Option<&str> = None;
250 for next_offset in offsets {
251 let current_value =
252 value_buffer
253 .get(current_offset..next_offset)
254 .ok_or_else(|| {
255 ArrowError::InvalidArgumentError(format!(
256 "range {current_offset}..{next_offset} is invalid or out of bounds"
257 ))
258 })?;
259
260 if let Some(prev_val) = prev_value {
261 if current_value <= prev_val {
262 return Err(ArrowError::InvalidArgumentError(
263 "dictionary values are not unique and ordered".to_string(),
264 ));
265 }
266 }
267
268 prev_value = Some(current_value);
269 current_offset = next_offset;
270 }
271 } else {
272 // Validate offsets are in-bounds and monotonically increasing
273 //
274 // Since shallow validation ensures the first and last offsets are in bounds,
275 // we can also verify all offsets are in-bounds by checking if
276 // offsets are monotonically increasing
277 if !offsets.is_sorted_by(|a, b| a < b) {
278 return Err(ArrowError::InvalidArgumentError(
279 "offsets not monotonically increasing".to_string(),
280 ));
281 }
282 }
283
284 self.validated = true;
285 }
286 Ok(self)
287 }
288
289 /// Whether the dictionary keys are sorted and unique
290 pub fn is_sorted(&self) -> bool {
291 self.header.is_sorted
292 }
293
294 /// The variant protocol version
295 pub const fn version(&self) -> u8 {
296 self.header.version
297 }
298
299 /// Gets an offset array entry by index.
300 ///
301 /// This offset is an index into the dictionary, at the boundary between string `i-1` and string
302 /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
303 fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
304 let offset_byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
305 let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
306 self.header.offset_size.unpack_u32(bytes, i)
307 }
308
309 /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
310 /// underlying bytes are [invalid].
311 ///
312 /// [invalid]: Self#Validation
313 pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
314 let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as _;
315 string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
316 }
317
318 /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the
319 /// iterator encounters [invalid] data.
320 ///
321 /// [invalid]: Self#Validation
322 pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
323 (0..self.len()).map(|i| self.get(i))
324 }
325
326 /// Iterates over all dictionary entries. When working with [unvalidated] input, consider
327 /// [`Self::iter_try`] to avoid panics due to invalid data.
328 ///
329 /// [unvalidated]: Self#Validation
330 pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
331 self.iter_try()
332 .map(|result| result.expect("Invalid metadata dictionary entry"))
333 }
334}
335
336/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
337/// [unvalidated] input could also panic if the underlying bytes are invalid.
338///
339/// [unvalidated]: Self#Validation
340impl std::ops::Index<usize> for VariantMetadata<'_> {
341 type Output = str;
342
343 fn index(&self, i: usize) -> &str {
344 self.get(i).expect("Invalid metadata dictionary entry")
345 }
346}
347
348#[cfg(test)]
349mod tests {
350
351 use crate::VariantBuilder;
352
353 use super::*;
354
355 /// `"cat"`, `"dog"` – valid metadata
356 #[test]
357 fn try_new_ok_inline() {
358 let bytes = &[
359 0b0000_0001, // header, offset_size_minus_one=0 and version=1
360 0x02, // dictionary_size (2 strings)
361 0x00,
362 0x03,
363 0x06,
364 b'c',
365 b'a',
366 b't',
367 b'd',
368 b'o',
369 b'g',
370 ];
371
372 let md = VariantMetadata::try_new(bytes).expect("should parse");
373 assert_eq!(md.len(), 2);
374 // Fields
375 assert_eq!(&md[0], "cat");
376 assert_eq!(&md[1], "dog");
377
378 // Offsets
379 assert_eq!(md.get_offset(0).unwrap(), 0x00);
380 assert_eq!(md.get_offset(1).unwrap(), 0x03);
381 assert_eq!(md.get_offset(2).unwrap(), 0x06);
382
383 let err = md.get_offset(3).unwrap_err();
384 assert!(
385 matches!(err, ArrowError::InvalidArgumentError(_)),
386 "unexpected error: {err:?}"
387 );
388
389 let fields: Vec<(usize, &str)> = md.iter().enumerate().collect();
390 assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]);
391 }
392
393 /// Too short buffer test (missing one required offset).
394 /// Should error with "metadata shorter than dictionary_size implies".
395 #[test]
396 fn try_new_missing_last_value() {
397 let bytes = &[
398 0b0000_0001, // header, offset_size_minus_one=0 and version=1
399 0x02, // dictionary_size = 2
400 0x00,
401 0x01,
402 0x02,
403 b'a',
404 b'b', // <-- we'll remove this
405 ];
406
407 let working_md = VariantMetadata::try_new(bytes).expect("should parse");
408 assert_eq!(working_md.len(), 2);
409 assert_eq!(&working_md[0], "a");
410 assert_eq!(&working_md[1], "b");
411
412 let truncated = &bytes[..bytes.len() - 1];
413
414 let err = VariantMetadata::try_new(truncated).unwrap_err();
415 assert!(
416 matches!(err, ArrowError::InvalidArgumentError(_)),
417 "unexpected error: {err:?}"
418 );
419 }
420
421 #[test]
422 fn try_new_fails_non_monotonic() {
423 // 'cat', 'dog', 'lamb'
424 let bytes = &[
425 0b0000_0001, // header, offset_size_minus_one=0 and version=1
426 0x03, // dictionary_size
427 0x00,
428 0x02,
429 0x01, // Doesn't increase monotonically
430 0x10,
431 b'c',
432 b'a',
433 b't',
434 b'd',
435 b'o',
436 b'g',
437 b'l',
438 b'a',
439 b'm',
440 b'b',
441 ];
442
443 let err = VariantMetadata::try_new(bytes).unwrap_err();
444 assert!(
445 matches!(err, ArrowError::InvalidArgumentError(_)),
446 "unexpected error: {err:?}"
447 );
448 }
449
450 #[test]
451 fn try_new_fails_non_monotonic2() {
452 // this test case checks whether offsets are monotonic in the full validation logic.
453
454 // 'cat', 'dog', 'lamb', "eel"
455 let bytes = &[
456 0b0000_0001, // header, offset_size_minus_one=0 and version=1
457 4, // dictionary_size
458 0x00,
459 0x02,
460 0x01, // Doesn't increase monotonically
461 0x10,
462 13,
463 b'c',
464 b'a',
465 b't',
466 b'd',
467 b'o',
468 b'g',
469 b'l',
470 b'a',
471 b'm',
472 b'b',
473 b'e',
474 b'e',
475 b'l',
476 ];
477
478 let err = VariantMetadata::try_new(bytes).unwrap_err();
479
480 assert!(
481 matches!(err, ArrowError::InvalidArgumentError(_)),
482 "unexpected error: {err:?}"
483 );
484 }
485
486 #[test]
487 fn try_new_truncated_offsets_inline() {
488 // Missing final offset
489 let bytes = &[0b0000_0001, 0x02, 0x00, 0x01];
490
491 let err = VariantMetadata::try_new(bytes).unwrap_err();
492 assert!(
493 matches!(err, ArrowError::InvalidArgumentError(_)),
494 "unexpected error: {err:?}"
495 );
496 }
497
498 #[test]
499 fn empty_string_is_valid() {
500 let bytes = &[
501 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
502 1,
503 0x00,
504 0x00,
505 ];
506 let metadata = VariantMetadata::try_new(bytes).unwrap();
507 assert_eq!(&metadata[0], "");
508
509 let bytes = &[
510 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
511 2,
512 0x00,
513 0x00,
514 0x02,
515 b'h',
516 b'i',
517 ];
518 let metadata = VariantMetadata::try_new(bytes).unwrap();
519 assert_eq!(&metadata[0], "");
520 assert_eq!(&metadata[1], "hi");
521
522 let bytes = &[
523 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
524 2,
525 0x00,
526 0x02,
527 0x02, // empty string is allowed, but must be first in a sorted dict
528 b'h',
529 b'i',
530 ];
531 let err = VariantMetadata::try_new(bytes).unwrap_err();
532 assert!(
533 matches!(err, ArrowError::InvalidArgumentError(_)),
534 "unexpected error: {err:?}"
535 );
536 }
537
538 #[test]
539 fn test_compare_sorted_dictionary_with_unsorted_dictionary() {
540 // create a sorted object
541 let mut b = VariantBuilder::new();
542 let mut o = b.new_object();
543
544 o.insert("a", false);
545 o.insert("b", false);
546
547 o.finish().unwrap();
548
549 let (m, _) = b.finish();
550
551 let m1 = VariantMetadata::new(&m);
552 assert!(m1.is_sorted());
553
554 // Create metadata with an unsorted dictionary (field names are "a", "a", "b")
555 // Since field names are not unique, it is considered not sorted.
556 let metadata_bytes = vec![
557 0b0000_0001,
558 3, // dictionary size
559 0, // "a"
560 1, // "a"
561 2, // "b"
562 3,
563 b'a',
564 b'a',
565 b'b',
566 ];
567 let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap();
568 assert!(!m2.is_sorted());
569
570 assert_ne!(m1, m2);
571 }
572
573 #[test]
574 fn test_compare_sorted_dictionary_with_sorted_dictionary() {
575 // create a sorted object
576 let mut b = VariantBuilder::new();
577 let mut o = b.new_object();
578
579 o.insert("a", false);
580 o.insert("b", false);
581
582 o.finish().unwrap();
583
584 let (m, _) = b.finish();
585
586 let m1 = VariantMetadata::new(&m);
587 let m2 = VariantMetadata::new(&m);
588
589 assert_eq!(m1, m2);
590 }
591}