parquet_variant/variant/metadata.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
19use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice};
20
21use arrow_schema::ArrowError;
22
23/// Header structure for [`VariantMetadata`]
24#[derive(Debug, Clone, Copy, PartialEq)]
25pub(crate) struct VariantMetadataHeader {
26 version: u8,
27 is_sorted: bool,
28 /// Note: This is `offset_size_minus_one` + 1
29 offset_size: OffsetSizeBytes,
30}
31
32// According to the spec this is currently always = 1, and so we store this const for validation
33// purposes and to make that visible.
34const CORRECT_VERSION_VALUE: u8 = 1;
35
36// The metadata header occupies one byte; use a named constant for readability
37const NUM_HEADER_BYTES: u32 = 1;
38
39impl VariantMetadataHeader {
40 // Hide the cast
41 const fn offset_size(&self) -> u32 {
42 self.offset_size as u32
43 }
44
45 // Avoid materializing this offset, since it's cheaply and safely computable
46 const fn first_offset_byte(&self) -> u32 {
47 NUM_HEADER_BYTES + self.offset_size()
48 }
49
50 /// Tries to construct the variant metadata header, which has the form
51 ///
52 /// ```text
53 /// 7 6 5 4 3 0
54 /// +-------+---+---+---------------+
55 /// header | | | | version |
56 /// +-------+---+---+---------------+
57 /// ^ ^
58 /// | +-- sorted_strings
59 /// +-- offset_size_minus_one
60 /// ```
61 ///
62 /// The version is a 4-bit value that must always contain the value 1.
63 /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique.
64 /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field.
65 /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1
66 pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
67 let version = header_byte & 0x0F; // First four bits
68 if version != CORRECT_VERSION_VALUE {
69 let err_msg = format!(
70 "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {version:b}",
71 );
72 return Err(ArrowError::InvalidArgumentError(err_msg));
73 }
74 let is_sorted = (header_byte & 0x10) != 0; // Fifth bit
75 let offset_size_minus_one = header_byte >> 6; // Last two bits
76 Ok(Self {
77 version,
78 is_sorted,
79 offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?,
80 })
81 }
82}
83
84/// [`Variant`] Metadata
85///
86/// See the [Variant Spec] file for more information
87///
88/// # Validation
89///
90/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
91/// underlying bytes are a valid encoding of variant metadata (see below).
92///
93/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
94/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
95/// validation cost is linear in the number of underlying bytes.
96///
97/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
98/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
99/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
100/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
101/// _unvalidated_ instance, if desired.
102///
103/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
104/// knows the underlying bytes were already validated previously, or if the caller intends to
105/// perform a small number of (fallible) accesses to a large dictionary.
106///
107/// A _validated_ variant [metadata instance guarantees that:
108///
109/// - header byte is valid
110/// - dictionary size is in bounds
111/// - offset array content is in-bounds
112/// - first offset is zero
113/// - last offset is in-bounds
114/// - all other offsets are in-bounds (*)
115/// - all offsets are monotonically increasing (*)
116/// - all values are valid utf-8 (*)
117///
118/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
119/// in the list above); it panics any of the other checks fails.
120///
121/// # Safety
122///
123/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing
124/// it with infallible methods may cause panics but will never lead to undefined behavior.
125///
126/// [`Variant`]: crate::Variant
127/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
128#[derive(Debug, Clone, PartialEq)]
129pub struct VariantMetadata<'m> {
130 pub(crate) bytes: &'m [u8],
131 header: VariantMetadataHeader,
132 dictionary_size: u32,
133 first_value_byte: u32,
134 validated: bool,
135}
136
137// We don't want this to grow because it increases the size of VariantList and VariantObject, which
138// could increase the size of Variant. All those size increases could hurt performance.
139const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
140
141impl<'m> VariantMetadata<'m> {
142 /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all
143 /// dictionary entries.
144 ///
145 /// [validation]: Self#Validation
146 pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
147 Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
148 }
149
150 /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
151 /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as
152 /// indexing and iteration could also panic if the underlying bytes are invalid.
153 ///
154 /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes
155 /// were already validated previously by other means, or if the caller expects a small number of
156 /// accesses to a large dictionary (preferring to use a small number of fallible accesses as
157 /// needed, instead of paying expensive full validation up front).
158 ///
159 /// [validate]: Self#Validation
160 pub fn new(bytes: &'m [u8]) -> Self {
161 Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
162 }
163
164 // The actual constructor, which performs only basic (constant-const) validation.
165 pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
166 let header_byte = first_byte_from_slice(bytes)?;
167 let header = VariantMetadataHeader::try_new(header_byte)?;
168
169 // First element after header is dictionary size; the offset array immediately follows.
170 let dictionary_size =
171 header
172 .offset_size
173 .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
174
175 // Calculate the starting offset of the dictionary string bytes.
176 //
177 // There are dict_size + 1 offsets, and the value bytes immediately follow
178 // = (dict_size + 1) * offset_size + header.first_offset_byte()
179 let first_value_byte = dictionary_size
180 .checked_add(1)
181 .and_then(|n| n.checked_mul(header.offset_size()))
182 .and_then(|n| n.checked_add(header.first_offset_byte()))
183 .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
184
185 let mut new_self = Self {
186 bytes,
187 header,
188 dictionary_size,
189 first_value_byte,
190 validated: false,
191 };
192
193 // Validate just the first and last offset, ignoring the other offsets and all value bytes.
194 let first_offset = new_self.get_offset(0)?;
195 if first_offset != 0 {
196 return Err(ArrowError::InvalidArgumentError(format!(
197 "First offset is not zero: {first_offset}"
198 )));
199 }
200
201 // Use the last offset to upper-bound the byte slice
202 let last_offset = new_self
203 .get_offset(dictionary_size as _)?
204 .checked_add(first_value_byte)
205 .ok_or_else(|| overflow_error("variant metadata size"))?;
206 new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
207 Ok(new_self)
208 }
209
210 /// The number of metadata dictionary entries
211 pub fn len(&self) -> usize {
212 self.dictionary_size()
213 }
214
215 /// True if this metadata dictionary contains no entries
216 pub fn is_empty(&self) -> bool {
217 self.len() == 0
218 }
219
220 /// True if this instance is fully [validated] for panic-free infallible accesses.
221 ///
222 /// [validated]: Self#Validation
223 pub fn is_fully_validated(&self) -> bool {
224 self.validated
225 }
226
227 /// Performs a full [validation] of this metadata dictionary and returns the result.
228 ///
229 /// [validation]: Self#Validation
230 pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
231 if !self.validated {
232 let offset_bytes = slice_from_slice(
233 self.bytes,
234 self.header.first_offset_byte() as _..self.first_value_byte as _,
235 )?;
236
237 let offsets =
238 map_bytes_to_offsets(offset_bytes, self.header.offset_size).collect::<Vec<_>>();
239
240 // Verify the string values in the dictionary are UTF-8 encoded strings.
241 let value_buffer =
242 string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
243
244 if self.header.is_sorted {
245 // Validate the dictionary values are unique and lexicographically sorted
246 //
247 // Since we use the offsets to access dictionary values, this also validates
248 // offsets are in-bounds and monotonically increasing
249 let are_dictionary_values_unique_and_sorted = (1..offsets.len())
250 .map(|i| {
251 let field_range = offsets[i - 1]..offsets[i];
252 value_buffer.get(field_range)
253 })
254 .is_sorted_by(|a, b| match (a, b) {
255 (Some(a), Some(b)) => a < b,
256 _ => false,
257 });
258
259 if !are_dictionary_values_unique_and_sorted {
260 return Err(ArrowError::InvalidArgumentError(
261 "dictionary values are not unique and ordered".to_string(),
262 ));
263 }
264 } else {
265 // Validate offsets are in-bounds and monotonically increasing
266 //
267 // Since shallow validation ensures the first and last offsets are in bounds,
268 // we can also verify all offsets are in-bounds by checking if
269 // offsets are monotonically increasing
270 let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
271 if !are_offsets_monotonic {
272 return Err(ArrowError::InvalidArgumentError(
273 "offsets not monotonically increasing".to_string(),
274 ));
275 }
276 }
277
278 self.validated = true;
279 }
280 Ok(self)
281 }
282
283 /// Whether the dictionary keys are sorted and unique
284 pub fn is_sorted(&self) -> bool {
285 self.header.is_sorted
286 }
287
288 /// Get the dictionary size
289 pub const fn dictionary_size(&self) -> usize {
290 self.dictionary_size as _
291 }
292
293 /// The variant protocol version
294 pub const fn version(&self) -> u8 {
295 self.header.version
296 }
297
298 /// Gets an offset array entry by index.
299 ///
300 /// This offset is an index into the dictionary, at the boundary between string `i-1` and string
301 /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
302 fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
303 let offset_byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
304 let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
305 self.header.offset_size.unpack_u32(bytes, i)
306 }
307
308 /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
309 /// underlying bytes are [invalid].
310 ///
311 /// [invalid]: Self#Validation
312 pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
313 let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as _;
314 string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
315 }
316
317 /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the
318 /// iterator encounters [invalid] data.
319 ///
320 /// [invalid]: Self#Validation
321 pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
322 (0..self.len()).map(|i| self.get(i))
323 }
324
325 /// Iterates over all dictionary entries. When working with [unvalidated] input, consider
326 /// [`Self::iter_try`] to avoid panics due to invalid data.
327 ///
328 /// [unvalidated]: Self#Validation
329 pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
330 self.iter_try()
331 .map(|result| result.expect("Invalid metadata dictionary entry"))
332 }
333}
334
335/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing
336/// [unvalidated] input could also panic if the underlying bytes are invalid.
337///
338/// [unvalidated]: Self#Validation
339impl std::ops::Index<usize> for VariantMetadata<'_> {
340 type Output = str;
341
342 fn index(&self, i: usize) -> &str {
343 self.get(i).expect("Invalid metadata dictionary entry")
344 }
345}
346
347#[cfg(test)]
348mod tests {
349 use super::*;
350
351 /// `"cat"`, `"dog"` – valid metadata
352 #[test]
353 fn try_new_ok_inline() {
354 let bytes = &[
355 0b0000_0001, // header, offset_size_minus_one=0 and version=1
356 0x02, // dictionary_size (2 strings)
357 0x00,
358 0x03,
359 0x06,
360 b'c',
361 b'a',
362 b't',
363 b'd',
364 b'o',
365 b'g',
366 ];
367
368 let md = VariantMetadata::try_new(bytes).expect("should parse");
369 assert_eq!(md.dictionary_size(), 2);
370 // Fields
371 assert_eq!(&md[0], "cat");
372 assert_eq!(&md[1], "dog");
373
374 // Offsets
375 assert_eq!(md.get_offset(0).unwrap(), 0x00);
376 assert_eq!(md.get_offset(1).unwrap(), 0x03);
377 assert_eq!(md.get_offset(2).unwrap(), 0x06);
378
379 let err = md.get_offset(3).unwrap_err();
380 assert!(
381 matches!(err, ArrowError::InvalidArgumentError(_)),
382 "unexpected error: {err:?}"
383 );
384
385 let fields: Vec<(usize, &str)> = md.iter().enumerate().collect();
386 assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]);
387 }
388
389 /// Too short buffer test (missing one required offset).
390 /// Should error with "metadata shorter than dictionary_size implies".
391 #[test]
392 fn try_new_missing_last_value() {
393 let bytes = &[
394 0b0000_0001, // header, offset_size_minus_one=0 and version=1
395 0x02, // dictionary_size = 2
396 0x00,
397 0x01,
398 0x02,
399 b'a',
400 b'b', // <-- we'll remove this
401 ];
402
403 let working_md = VariantMetadata::try_new(bytes).expect("should parse");
404 assert_eq!(working_md.dictionary_size(), 2);
405 assert_eq!(&working_md[0], "a");
406 assert_eq!(&working_md[1], "b");
407
408 let truncated = &bytes[..bytes.len() - 1];
409
410 let err = VariantMetadata::try_new(truncated).unwrap_err();
411 assert!(
412 matches!(err, ArrowError::InvalidArgumentError(_)),
413 "unexpected error: {err:?}"
414 );
415 }
416
417 #[test]
418 fn try_new_fails_non_monotonic() {
419 // 'cat', 'dog', 'lamb'
420 let bytes = &[
421 0b0000_0001, // header, offset_size_minus_one=0 and version=1
422 0x03, // dictionary_size
423 0x00,
424 0x02,
425 0x01, // Doesn't increase monotonically
426 0x10,
427 b'c',
428 b'a',
429 b't',
430 b'd',
431 b'o',
432 b'g',
433 b'l',
434 b'a',
435 b'm',
436 b'b',
437 ];
438
439 let err = VariantMetadata::try_new(bytes).unwrap_err();
440 assert!(
441 matches!(err, ArrowError::InvalidArgumentError(_)),
442 "unexpected error: {err:?}"
443 );
444 }
445
446 #[test]
447 fn try_new_fails_non_monotonic2() {
448 // this test case checks whether offsets are monotonic in the full validation logic.
449
450 // 'cat', 'dog', 'lamb', "eel"
451 let bytes = &[
452 0b0000_0001, // header, offset_size_minus_one=0 and version=1
453 4, // dictionary_size
454 0x00,
455 0x02,
456 0x01, // Doesn't increase monotonically
457 0x10,
458 13,
459 b'c',
460 b'a',
461 b't',
462 b'd',
463 b'o',
464 b'g',
465 b'l',
466 b'a',
467 b'm',
468 b'b',
469 b'e',
470 b'e',
471 b'l',
472 ];
473
474 let err = VariantMetadata::try_new(bytes).unwrap_err();
475
476 assert!(
477 matches!(err, ArrowError::InvalidArgumentError(_)),
478 "unexpected error: {err:?}"
479 );
480 }
481
482 #[test]
483 fn try_new_truncated_offsets_inline() {
484 // Missing final offset
485 let bytes = &[0b0000_0001, 0x02, 0x00, 0x01];
486
487 let err = VariantMetadata::try_new(bytes).unwrap_err();
488 assert!(
489 matches!(err, ArrowError::InvalidArgumentError(_)),
490 "unexpected error: {err:?}"
491 );
492 }
493}