parquet_variant/variant/
object.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
19use crate::utils::{
20    first_byte_from_slice, overflow_error, slice_from_slice, try_binary_search_range_by,
21};
22use crate::variant::{Variant, VariantMetadata};
23
24use arrow_schema::ArrowError;
25
// The value header occupies one byte; use a named constant for readability.
// It doubles as the byte offset at which the `num_elements` field is read.
const NUM_HEADER_BYTES: u32 = 1;
28
/// Header structure for [`VariantObject`]
///
/// Holds the byte widths, decoded from the object's header byte, of the variable-width parts
/// of the object encoding.
#[derive(Debug, Clone, PartialEq)]
pub(crate) struct VariantObjectHeader {
    /// Width of the `num_elements` field: four bytes when the header's `is_large` bit is set,
    /// one byte otherwise.
    num_elements_size: OffsetSizeBytes,
    /// Width of each entry of the field id array.
    field_id_size: OffsetSizeBytes,
    /// Width of each entry of the field offset array.
    field_offset_size: OffsetSizeBytes,
}
36
37impl VariantObjectHeader {
38    // Hide the ugly casting
39    const fn num_elements_size(&self) -> u32 {
40        self.num_elements_size as _
41    }
42    const fn field_id_size(&self) -> u32 {
43        self.field_id_size as _
44    }
45    const fn field_offset_size(&self) -> u32 {
46        self.field_offset_size as _
47    }
48
49    // Avoid materializing this offset, since it's cheaply and safely computable
50    const fn field_ids_start_byte(&self) -> u32 {
51        NUM_HEADER_BYTES + self.num_elements_size()
52    }
53
54    pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
55        // Parse the header byte to get object parameters
56        let value_header = header_byte >> 2;
57        let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
58        let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 bits
59        let is_large = (value_header & 0x10) != 0; // 5th bit
60        let num_elements_size = match is_large {
61            true => OffsetSizeBytes::Four,
62            false => OffsetSizeBytes::One,
63        };
64        Ok(Self {
65            num_elements_size,
66            field_id_size: OffsetSizeBytes::try_new(field_id_size_minus_one)?,
67            field_offset_size: OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
68        })
69    }
70}
71
/// A [`Variant`] Object (struct with named fields).
///
/// See the [Variant spec] file for more information.
///
/// # Validation
///
/// Every instance of variant object is either _valid_ or _invalid_, depending on whether the
/// underlying bytes are a valid encoding of a variant object subtype (see below).
///
/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully (and recursively)
/// _validated_. They always contain _valid_ data, and infallible accesses such as iteration and
/// indexing are panic-free. The validation cost is linear in the number of underlying bytes.
///
/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and
/// [`Self::try_field`] are provided as panic-free alternatives. [`Self::with_full_validation`]
/// can also be used to _validate_ an _unvalidated_ instance, if desired.
///
/// _Unvalidated_ instances can be constructed in constant time. They can be useful if the caller
/// knows the underlying bytes were already validated previously, or if the caller intends to
/// perform a small number of (fallible) field accesses against a large object.
///
/// A _validated_ instance guarantees that:
///
/// - header byte is valid
/// - num_elements is in bounds
/// - field id array is in bounds
/// - field offset array is in bounds
/// - field value array is in bounds
/// - all field ids are valid metadata dictionary entries (*)
/// - field ids are lexically ordered according to their corresponding string values (*)
/// - all field offsets are in bounds (*)
/// - all field values are (recursively) _valid_ variant values (*)
/// - the associated variant metadata is [valid] (*)
///
/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
/// in the list above); it panics if any of the other checks fails.
///
/// # Safety
///
/// Even an _invalid_ variant object instance is still _safe_ to use in the Rust sense. Accessing it
/// with infallible methods may cause panics but will never lead to undefined behavior.
///
/// [valid]: VariantMetadata#Validation
/// [Variant spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-object-basic_type2
#[derive(Debug, Clone)]
pub struct VariantObject<'m, 'v> {
    /// Metadata dictionary used to resolve field ids to field names.
    pub metadata: VariantMetadata<'m>,
    /// Bytes of the encoded object. The constructors truncate this slice at the end of the
    /// last field value.
    pub value: &'v [u8],
    // Byte widths decoded from the object's header byte.
    header: VariantObjectHeader,
    // Number of fields in the object.
    num_elements: u32,
    // Byte position within `value` where the field offset array begins.
    first_field_offset_byte: u32,
    // Byte position within `value` where the field values begin.
    first_value_byte: u32,
    // Set once `with_full_validation` has succeeded.
    validated: bool,
}
128
// We don't want this to grow because it could increase the size of `Variant` and hurt performance.
// NOTE(review): `expect_size_of` presumably rejects, at compile time, any size other than the
// given 64 bytes -- confirm against `crate::utils`.
const _: () = crate::utils::expect_size_of::<VariantObject>(64);
131
132impl<'m, 'v> VariantObject<'m, 'v> {
    /// Interprets `metadata` and `value` as a variant object, performing only basic
    /// (constant-cost) [validation].
    ///
    /// # Panics
    ///
    /// Panics with "Invalid variant object" if the basic validation fails. Use
    /// [`Self::try_new`] for a fallible, fully validating constructor.
    ///
    /// [validation]: Self#Validation
    pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
        Self::try_new_with_shallow_validation(metadata, value).expect("Invalid variant object")
    }
136
    /// Attempts to interpret `metadata` and `value` as a variant object.
    ///
    /// # Validation
    ///
    /// This constructor verifies that `value` points to a valid variant object value. In
    /// particular, that all field ids exist in `metadata`, and all offsets are in-bounds and point
    /// to valid variant values.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by either the shallow or the full validation pass.
    pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> {
        Self::try_new_with_shallow_validation(metadata, value)?.with_full_validation()
    }
147
148    /// Attempts to interpet `metadata` and `value` as a variant object, performing only basic
149    /// (constant-cost) [validation].
150    ///
151    /// [validation]: Self#Validation
152    pub(crate) fn try_new_with_shallow_validation(
153        metadata: VariantMetadata<'m>,
154        value: &'v [u8],
155    ) -> Result<Self, ArrowError> {
156        let header_byte = first_byte_from_slice(value)?;
157        let header = VariantObjectHeader::try_new(header_byte)?;
158
159        // Determine num_elements size based on is_large flag and fetch the value
160        let num_elements =
161            header
162                .num_elements_size
163                .unpack_u32_at_offset(value, NUM_HEADER_BYTES as _, 0)?;
164
165        // Calculate byte offsets for field offsets and values with overflow protection, and verify
166        // they're in bounds
167        let first_field_offset_byte = num_elements
168            .checked_mul(header.field_id_size())
169            .and_then(|n| n.checked_add(header.field_ids_start_byte()))
170            .ok_or_else(|| overflow_error("offset of variant object field offsets"))?;
171
172        let first_value_byte = num_elements
173            .checked_add(1)
174            .and_then(|n| n.checked_mul(header.field_offset_size()))
175            .and_then(|n| n.checked_add(first_field_offset_byte))
176            .ok_or_else(|| overflow_error("offset of variant object field values"))?;
177
178        let mut new_self = Self {
179            metadata,
180            value,
181            header,
182            num_elements,
183            first_field_offset_byte,
184            first_value_byte,
185            validated: false,
186        };
187
188        // Spec says: "The last field_offset points to the byte after the end of the last value"
189        //
190        // Use it to upper-bound the value bytes, which also verifies that the field id and field
191        // offset arrays are in bounds.
192        let last_offset = new_self
193            .get_offset(num_elements as _)?
194            .checked_add(first_value_byte)
195            .ok_or_else(|| overflow_error("variant object size"))?;
196        new_self.value = slice_from_slice(value, ..last_offset as _)?;
197        Ok(new_self)
198    }
199
    /// True if this instance is fully [validated] for panic-free infallible accesses.
    ///
    /// The flag starts out `false` after shallow construction and is set by a successful
    /// [`Self::with_full_validation`].
    ///
    /// [validated]: Self#Validation
    pub fn is_fully_validated(&self) -> bool {
        self.validated
    }
206
207    /// Performs a full [validation] of this variant object.
208    ///
209    /// [validation]: Self#Validation
210    pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
211        if !self.validated {
212            // Validate the metadata dictionary first, if not already validated, because we pass it
213            // by value to all the children (who would otherwise re-validate it repeatedly).
214            self.metadata = self.metadata.with_full_validation()?;
215
216            let field_id_buffer = slice_from_slice(
217                self.value,
218                self.header.field_ids_start_byte() as _..self.first_field_offset_byte as _,
219            )?;
220
221            let mut field_ids_iter =
222                map_bytes_to_offsets(field_id_buffer, self.header.field_id_size);
223
224            // Validate all field ids exist in the metadata dictionary and the corresponding field names are lexicographically sorted
225            if self.metadata.is_sorted() {
226                // Since the metadata dictionary has unique and sorted field names, we can also guarantee this object's field names
227                // are lexicographically sorted by their field id ordering
228                let dictionary_size = self.metadata.len();
229
230                if let Some(mut current_id) = field_ids_iter.next() {
231                    for next_id in field_ids_iter {
232                        if current_id >= dictionary_size {
233                            return Err(ArrowError::InvalidArgumentError(
234                                "field id is not valid".to_string(),
235                            ));
236                        }
237
238                        if next_id <= current_id {
239                            return Err(ArrowError::InvalidArgumentError(
240                                "field names not sorted".to_string(),
241                            ));
242                        }
243                        current_id = next_id;
244                    }
245
246                    if current_id >= dictionary_size {
247                        return Err(ArrowError::InvalidArgumentError(
248                            "field id is not valid".to_string(),
249                        ));
250                    }
251                }
252            } else {
253                // The metadata dictionary can't guarantee uniqueness or sortedness, so we have to parse out the corresponding field names
254                // to check lexicographical order
255                //
256                // Since we are probing the metadata dictionary by field id, this also verifies field ids are in-bounds
257                let mut current_field_name = match field_ids_iter.next() {
258                    Some(field_id) => Some(self.metadata.get(field_id)?),
259                    None => None,
260                };
261
262                for field_id in field_ids_iter {
263                    let next_field_name = self.metadata.get(field_id)?;
264
265                    if let Some(current_name) = current_field_name {
266                        if next_field_name < current_name {
267                            return Err(ArrowError::InvalidArgumentError(
268                                "field names not sorted".to_string(),
269                            ));
270                        }
271                    }
272                    current_field_name = Some(next_field_name);
273                }
274            }
275
276            // Validate whether values are valid variant objects
277            let field_offset_buffer = slice_from_slice(
278                self.value,
279                self.first_field_offset_byte as _..self.first_value_byte as _,
280            )?;
281            let num_offsets = field_offset_buffer.len() / self.header.field_offset_size() as usize;
282
283            let value_buffer = slice_from_slice(self.value, self.first_value_byte as _..)?;
284
285            map_bytes_to_offsets(field_offset_buffer, self.header.field_offset_size)
286                .take(num_offsets.saturating_sub(1))
287                .try_for_each(|offset| {
288                    let value_bytes = slice_from_slice(value_buffer, offset..)?;
289                    Variant::try_new_with_metadata(self.metadata.clone(), value_bytes)?;
290
291                    Ok::<_, ArrowError>(())
292                })?;
293
294            self.validated = true;
295        }
296        Ok(self)
297    }
298
    /// Returns the number of key-value pairs in this object
    ///
    /// This is the element count that was decoded when the object was constructed.
    pub fn len(&self) -> usize {
        self.num_elements as _
    }
303
304    /// Returns true if the object contains no key-value pairs
305    pub fn is_empty(&self) -> bool {
306        self.len() == 0
307    }
308
309    /// Get a field's value by index in `0..self.len()`
310    ///
311    /// # Panics
312    ///
313    /// If the index is out of bounds. Also if variant object is corrupted (e.g., invalid offsets or
314    /// field IDs). The latter can only happen when working with an unvalidated object produced by
315    /// [`Self::new`].
316    pub fn field(&self, i: usize) -> Option<Variant<'m, 'v>> {
317        (i < self.len()).then(|| {
318            self.try_field_with_shallow_validation(i)
319                .expect("Invalid object field value")
320        })
321    }
322
    /// Fallible version of `field`. Returns field value by index, capturing validation errors
    ///
    /// The returned value has been fully validated. Unlike `field`, this method returns a
    /// `Result` rather than an `Option` and performs no explicit bounds check of its own on `i`.
    pub fn try_field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
        self.try_field_with_shallow_validation(i)?
            .with_full_validation()
    }
328
329    // Attempts to retrieve the ith field value from the value region of the byte buffer; it
330    // performs only basic (constant-cost) validation.
331    fn try_field_with_shallow_validation(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
332        let value_bytes = slice_from_slice(self.value, self.first_value_byte as _..)?;
333        let value_bytes = slice_from_slice(value_bytes, self.get_offset(i)? as _..)?;
334        Variant::try_new_with_metadata_and_shallow_validation(self.metadata.clone(), value_bytes)
335    }
336
337    // Attempts to retrieve the ith offset from the field offset region of the byte buffer.
338    fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
339        let byte_range = self.first_field_offset_byte as _..self.first_value_byte as _;
340        let field_offsets = slice_from_slice(self.value, byte_range)?;
341        self.header.field_offset_size.unpack_u32(field_offsets, i)
342    }
343
344    /// Get a field's name by index in `0..self.len()`
345    ///
346    /// # Panics
347    /// If the variant object is corrupted (e.g., invalid offsets or field IDs).
348    /// This should never happen since the constructor validates all data upfront.
349    pub fn field_name(&self, i: usize) -> Option<&'m str> {
350        (i < self.len()).then(|| {
351            self.try_field_name(i)
352                .expect("Invalid variant object field name")
353        })
354    }
355
356    /// Fallible version of `field_name`. Returns field name by index, capturing validation errors
357    fn try_field_name(&self, i: usize) -> Result<&'m str, ArrowError> {
358        let byte_range = self.header.field_ids_start_byte() as _..self.first_field_offset_byte as _;
359        let field_id_bytes = slice_from_slice(self.value, byte_range)?;
360        let field_id = self.header.field_id_size.unpack_u32(field_id_bytes, i)?;
361        self.metadata.get(field_id as _)
362    }
363
    /// Returns an iterator of (name, value) pairs over the fields of this object.
    ///
    /// # Panics
    ///
    /// Advancing the iterator panics with "Invalid variant object field value" if a field name
    /// or value cannot be read. [`Self::iter_try`] is the fallible alternative.
    pub fn iter(&self) -> impl Iterator<Item = (&'m str, Variant<'m, 'v>)> + '_ {
        self.iter_try_with_shallow_validation()
            .map(|result| result.expect("Invalid variant object field value"))
    }
369
370    /// Fallible iteration over the fields of this object.
371    pub fn iter_try(
372        &self,
373    ) -> impl Iterator<Item = Result<(&'m str, Variant<'m, 'v>), ArrowError>> + '_ {
374        self.iter_try_with_shallow_validation().map(|result| {
375            let (name, value) = result?;
376            Ok((name, value.with_full_validation()?))
377        })
378    }
379
380    // Fallible iteration over the fields of this object that performs only shallow (constant-cost)
381    // validation of field values.
382    fn iter_try_with_shallow_validation(
383        &self,
384    ) -> impl Iterator<Item = Result<(&'m str, Variant<'m, 'v>), ArrowError>> + '_ {
385        (0..self.len()).map(|i| {
386            let field = self.try_field_with_shallow_validation(i)?;
387            Ok((self.try_field_name(i)?, field))
388        })
389    }
390
    /// Returns the value of the field with the specified name, if any.
    ///
    /// `None` means that no field with that name was found.
    ///
    /// # Panics
    ///
    /// The search reads field names and the matching value through the infallible
    /// [`Self::field_name`] and [`Self::field`], so it panics under the same conditions as
    /// those methods (a corrupted, unvalidated object).
    pub fn get(&self, name: &str) -> Option<Variant<'m, 'v>> {
        // Binary search through the field IDs of this object to find the requested field name.
        //
        // NOTE: This does not require a sorted metadata dictionary, because the variant spec
        // requires object field ids to be lexically sorted by their corresponding string values,
        // and probing the dictionary for a field id is always O(1) work.
        let cmp = |i| Some(self.field_name(i)?.cmp(name));
        let i = try_binary_search_range_by(0..self.len(), cmp)?.ok()?;
        self.field(i)
    }
404}
405
406// Custom implementation of PartialEq for variant objects
407//
408// According to the spec, field values are not required to be in the same order as the field IDs,
409// to enable flexibility when constructing Variant values
410//
411// Instead of comparing the raw bytes of 2 variant objects, this implementation recursively
412// checks whether the field values are equal -- regardless of their order
413impl<'m, 'v> PartialEq for VariantObject<'m, 'v> {
414    fn eq(&self, other: &Self) -> bool {
415        if self.num_elements != other.num_elements {
416            return false;
417        }
418
419        // IFF two objects are valid and logically equal, they will have the same
420        // field names in the same order, because the spec requires the object
421        // fields to be sorted lexicographically.
422        self.iter()
423            .zip(other.iter())
424            .all(|((name_a, value_a), (name_b, value_b))| name_a == name_b && value_a == value_b)
425    }
426}
427
428#[cfg(test)]
429mod tests {
430    use crate::VariantBuilder;
431
432    use super::*;
433
    // Hand-encodes a three-field object and exercises lookup by name, lookup by index
    // (including out-of-range indexes) and iteration.
    #[test]
    fn test_variant_object_simple() {
        // Create metadata with field names: "active", "age", "name" (sorted)
        // Header: version=1, sorted=1, offset_size=1 (offset_size_minus_one=0)
        // So header byte = 00_0_1_0001 = 0x11
        let metadata_bytes = vec![
            0b0001_0001,
            3, // dictionary size
            0, // "active"
            6, // "age"
            9, // "name"
            13, // end of the last string
            b'a',
            b'c',
            b't',
            b'i',
            b'v',
            b'e',
            b'a',
            b'g',
            b'e',
            b'n',
            b'a',
            b'm',
            b'e',
        ];
        let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();

        // Create object value data for: {"active": true, "age": 42, "name": "hello"}
        // Field IDs in sorted order: [0, 1, 2] (active, age, name)
        // Header: basic_type=2, field_offset_size_minus_one=0, field_id_size_minus_one=0, is_large=0
        // value_header = 0000_00_00 = 0x00
        // So header byte = (0x00 << 2) | 2 = 0x02
        let object_value = vec![
            0x02, // header: basic_type=2, value_header=0x00
            3,    // num_elements = 3
            // Field IDs (1 byte each): active=0, age=1, name=2
            0, 1, 2,
            // Field offsets (1 byte each): 4 offsets total
            0, // offset to first value (boolean true)
            1, // offset to second value (int8)
            3, // offset to third value (short string)
            9, // end offset
            // Values:
            0x04, // boolean true: primitive_header=1, basic_type=0 -> (1 << 2) | 0 = 0x04
            0x0C,
            42, // int8: primitive_header=3, basic_type=0 -> (3 << 2) | 0 = 0x0C, then value 42
            0x15, b'h', b'e', b'l', b'l',
            b'o', // short string: length=5, basic_type=1 -> (5 << 2) | 1 = 0x15
        ];

        let variant_obj = VariantObject::try_new(metadata, &object_value).unwrap();

        // Test basic properties
        assert_eq!(variant_obj.len(), 3);
        assert!(!variant_obj.is_empty());

        // Test field access
        let active_field = variant_obj.get("active");
        assert!(active_field.is_some());
        assert_eq!(active_field.unwrap().as_boolean(), Some(true));

        let age_field = variant_obj.get("age");
        assert!(age_field.is_some());
        assert_eq!(age_field.unwrap().as_int8(), Some(42));

        let name_field = variant_obj.get("name");
        assert!(name_field.is_some());
        assert_eq!(name_field.unwrap().as_string(), Some("hello"));

        // Test non-existent field
        let missing_field = variant_obj.get("missing");
        assert!(missing_field.is_none());

        // Out-of-range indexes yield `None` rather than panicking
        let missing_field_name = variant_obj.field_name(3);
        assert!(missing_field_name.is_none());

        let missing_field_name = variant_obj.field_name(300);
        assert!(missing_field_name.is_none());

        let missing_field_value = variant_obj.field(3);
        assert!(missing_field_value.is_none());

        let missing_field_value = variant_obj.field(300);
        assert!(missing_field_value.is_none());

        // Test fields iterator
        let fields: Vec<_> = variant_obj.iter().collect();
        assert_eq!(fields.len(), 3);

        // Fields should be in sorted order: active, age, name
        assert_eq!(fields[0].0, "active");
        assert_eq!(fields[0].1.as_boolean(), Some(true));

        assert_eq!(fields[1].0, "age");
        assert_eq!(fields[1].1.as_int8(), Some(42));

        assert_eq!(fields[2].0, "name");
        assert_eq!(fields[2].1.as_string(), Some("hello"));

        // Test field access by index
        // Fields should be in sorted order: active, age, name
        assert_eq!(variant_obj.field_name(0), Some("active"));
        assert_eq!(variant_obj.field(0).unwrap().as_boolean(), Some(true));

        assert_eq!(variant_obj.field_name(1), Some("age"));
        assert_eq!(variant_obj.field(1).unwrap().as_int8(), Some(42));

        assert_eq!(variant_obj.field_name(2), Some("name"));
        assert_eq!(variant_obj.field(2).unwrap().as_string(), Some("hello"));
    }
545
546    #[test]
547    fn test_variant_object_empty_fields() {
548        let mut builder = VariantBuilder::new();
549        builder.new_object().with_field("", 42).finish();
550        let (metadata, value) = builder.finish();
551
552        // Resulting object is valid and has a single empty field
553        let variant = Variant::try_new(&metadata, &value).unwrap();
554        let variant_obj = variant.as_object().unwrap();
555        assert_eq!(variant_obj.len(), 1);
556        assert_eq!(variant_obj.get(""), Some(Variant::from(42)));
557    }
558
559    #[test]
560    fn test_variant_object_empty() {
561        // Create metadata with no fields
562        let metadata_bytes = vec![
563            0x11, // header: version=1, sorted=0, offset_size_minus_one=0
564            0,    // dictionary_size = 0
565            0,    // offset[0] = 0 (end of dictionary)
566        ];
567        let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
568
569        // Create empty object value data: {}
570        let object_value = vec![
571            0x02, // header: basic_type=2, value_header=0x00
572            0,    // num_elements = 0
573            0,    // single offset pointing to end
574                  // No field IDs, no values
575        ];
576
577        let variant_obj = VariantObject::try_new(metadata, &object_value).unwrap();
578
579        // Test basic properties
580        assert_eq!(variant_obj.len(), 0);
581        assert!(variant_obj.is_empty());
582
583        // Test field access on empty object
584        let missing_field = variant_obj.get("anything");
585        assert!(missing_field.is_none());
586
587        // Test fields iterator on empty object
588        let fields: Vec<_> = variant_obj.iter().collect();
589        assert_eq!(fields.len(), 0);
590    }
591
592    #[test]
593    fn test_variant_object_invalid_metadata_end_offset() {
594        // Create metadata with field names: "age", "name" (sorted)
595        let metadata_bytes = vec![
596            0b0001_0001, // header: version=1, sorted=1, offset_size_minus_one=0
597            2,           // dictionary size
598            0,           // "age"
599            3,           // "name"
600            8,           // Invalid end offset (should be 7)
601            b'a',
602            b'g',
603            b'e',
604            b'n',
605            b'a',
606            b'm',
607            b'e',
608        ];
609        let err = VariantMetadata::try_new(&metadata_bytes);
610        let err = err.unwrap_err();
611        assert!(matches!(
612            err,
613            ArrowError::InvalidArgumentError(ref msg) if msg.contains("Tried to extract byte(s) ..13 from 12-byte buffer")
614        ));
615    }
616
617    #[test]
618    fn test_variant_object_invalid_end_offset() {
619        // Create metadata with field names: "age", "name" (sorted)
620        let metadata_bytes = vec![
621            0b0001_0001, // header: version=1, sorted=1, offset_size_minus_one=0
622            2,           // dictionary size
623            0,           // "age"
624            3,           // "name"
625            7,
626            b'a',
627            b'g',
628            b'e',
629            b'n',
630            b'a',
631            b'm',
632            b'e',
633        ];
634        let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
635
636        // Create object value data for: {"age": 42, "name": "hello"}
637        // Field IDs in sorted order: [0, 1] (age, name)
638        // Header: basic_type=2, field_offset_size_minus_one=0, field_id_size_minus_one=0, is_large=0
639        // value_header = 0000_00_00 = 0x00
640        let object_value = vec![
641            0x02, // header: basic_type=2, value_header=0x00
642            2,    // num_elements = 2
643            // Field IDs (1 byte each): age=0, name=1
644            0, 1,
645            // Field offsets (1 byte each): 3 offsets total
646            0, // offset to first value (int8)
647            2, // offset to second value (short string)
648            9, // invalid end offset (correct would be 8)
649            // Values:
650            0x0C,
651            42, // int8: primitive_header=3, basic_type=0 -> (3 << 2) | 0 = 0x0C, then value 42
652            0x15, b'h', b'e', b'l', b'l',
653            b'o', // short string: length=5, basic_type=1 -> (5 << 2) | 1 = 0x15
654        ];
655
656        let err = VariantObject::try_new(metadata, &object_value);
657        let err = err.unwrap_err();
658        assert!(matches!(
659            err,
660            ArrowError::InvalidArgumentError(ref msg) if msg.contains("Tried to extract byte(s) ..16 from 15-byte buffer")
661        ));
662    }
663
664    fn test_variant_object_with_count(count: i32, expected_field_id_size: OffsetSizeBytes) {
665        let field_names: Vec<_> = (0..count).map(|val| val.to_string()).collect();
666        let mut builder =
667            VariantBuilder::new().with_field_names(field_names.iter().map(|s| s.as_str()));
668
669        let mut obj = builder.new_object();
670
671        for i in 0..count {
672            obj.insert(&field_names[i as usize], i);
673        }
674
675        obj.finish();
676        let (metadata, value) = builder.finish();
677        let variant = Variant::new(&metadata, &value);
678
679        if let Variant::Object(obj) = variant {
680            assert_eq!(obj.len(), count as usize);
681
682            assert_eq!(obj.get(&field_names[0]).unwrap(), Variant::Int32(0));
683            assert_eq!(
684                obj.get(&field_names[(count - 1) as usize]).unwrap(),
685                Variant::Int32(count - 1)
686            );
687            assert_eq!(
688                obj.header.field_id_size, expected_field_id_size,
689                "Expected {}-byte field IDs, got {}-byte field IDs",
690                expected_field_id_size as usize, obj.header.field_id_size as usize
691            );
692        } else {
693            panic!("Expected object variant");
694        }
695    }
696
    // One field more than 2^8, so field ids are expected to need two bytes
    #[test]
    fn test_variant_object_257_elements() {
        test_variant_object_with_count((1 << 8) + 1, OffsetSizeBytes::Two); // 2^8 + 1, expected 2-byte field IDs
    }
701
    // One field more than 2^16, so field ids are expected to need three bytes
    #[test]
    fn test_variant_object_65537_elements() {
        test_variant_object_with_count((1 << 16) + 1, OffsetSizeBytes::Three);
        // 2^16 + 1, expected 3-byte field IDs
    }
707
708    /* Can't run this test now as it takes 45x longer than other tests
709    #[test]
710    fn test_variant_object_16777217_elements() {
711        test_variant_object_with_count((1 << 24) + 1, OffsetSizeBytes::Four);
712        // 2^24 + 1, expected 4-byte field IDs
713    }
714     */
715
    // 255 fields: field ids are expected to still fit in a single byte
    #[test]
    fn test_variant_object_small_sizes_255_elements() {
        test_variant_object_with_count(255, OffsetSizeBytes::One);
    }
720
721    fn test_variant_object_with_large_data(
722        data_size_per_field: usize,
723        expected_field_offset_size: OffsetSizeBytes,
724    ) {
725        let num_fields = 20;
726        let mut builder = VariantBuilder::new();
727        let mut obj = builder.new_object();
728
729        let str_val = "a".repeat(data_size_per_field);
730
731        for val in 0..num_fields {
732            let key = format!("id_{val}");
733            obj.insert(&key, str_val.as_str());
734        }
735
736        obj.finish();
737        let (metadata, value) = builder.finish();
738        let variant = Variant::new(&metadata, &value);
739
740        if let Variant::Object(obj) = variant {
741            assert_eq!(obj.len(), num_fields);
742            assert_eq!(
743                obj.header.field_offset_size, expected_field_offset_size,
744                "Expected {}-byte field offsets, got {}-byte field offsets",
745                expected_field_offset_size as usize, obj.header.field_offset_size as usize
746            );
747        } else {
748            panic!("Expected object variant");
749        }
750    }
751
752    #[test]
753    fn test_variant_object_child_data_0_byte_offsets_minus_one() {
754        test_variant_object_with_large_data(10, OffsetSizeBytes::One);
755    }
756
757    #[test]
758    fn test_variant_object_256_bytes_child_data_3_byte_offsets() {
759        test_variant_object_with_large_data(256 + 1, OffsetSizeBytes::Two); // 2^8 - 2^16 elements
760    }
761
762    #[test]
763    fn test_variant_object_16777216_bytes_child_data_4_byte_offsets() {
764        test_variant_object_with_large_data(65536 + 1, OffsetSizeBytes::Three); // 2^16 - 2^24 elements
765    }
766
767    #[test]
768    fn test_variant_object_65535_bytes_child_data_2_byte_offsets() {
769        test_variant_object_with_large_data(16777216 + 1, OffsetSizeBytes::Four);
770        // 2^24
771    }
772
773    #[test]
774    fn test_objects_with_same_fields_are_equal() {
775        let mut b = VariantBuilder::new();
776        let mut o = b.new_object();
777
778        o.insert("b", ());
779        o.insert("c", ());
780        o.insert("a", ());
781
782        o.finish();
783
784        let (m, v) = b.finish();
785
786        let v1 = Variant::try_new(&m, &v).unwrap();
787        let v2 = Variant::try_new(&m, &v).unwrap();
788
789        assert_eq!(v1, v2);
790    }
791
792    #[test]
793    fn test_same_objects_with_different_builder_are_equal() {
794        let mut b = VariantBuilder::new();
795        let mut o = b.new_object();
796
797        o.insert("a", ());
798        o.insert("b", false);
799
800        o.finish();
801        let (m, v) = b.finish();
802
803        let v1 = Variant::try_new(&m, &v).unwrap();
804
805        let mut b = VariantBuilder::new();
806        let mut o = b.new_object();
807
808        o.insert("a", ());
809        o.insert("b", false);
810
811        o.finish();
812        let (m, v) = b.finish();
813
814        let v2 = Variant::try_new(&m, &v).unwrap();
815
816        assert_eq!(v1, v2);
817    }
818
819    #[test]
820    fn test_objects_with_different_values_are_not_equal() {
821        let mut b = VariantBuilder::new();
822        let mut o = b.new_object();
823
824        o.insert("a", ());
825        o.insert("b", 4.3);
826
827        o.finish();
828
829        let (m, v) = b.finish();
830
831        let v1 = Variant::try_new(&m, &v).unwrap();
832
833        // second object, same field name but different values
834        let mut b = VariantBuilder::new();
835        let mut o = b.new_object();
836
837        o.insert("a", ());
838        let mut inner_o = o.new_object("b");
839        inner_o.insert("a", 3.3);
840        inner_o.finish();
841        o.finish();
842
843        let (m, v) = b.finish();
844
845        let v2 = Variant::try_new(&m, &v).unwrap();
846
847        let m1 = v1.metadata();
848        let m2 = v2.metadata();
849
850        // metadata would be equal since they contain the same keys
851        assert_eq!(m1, m2);
852
853        // but the objects are not equal
854        assert_ne!(v1, v2);
855    }
856
857    #[test]
858    fn test_objects_with_different_field_names_are_not_equal() {
859        let mut b = VariantBuilder::new();
860        let mut o = b.new_object();
861
862        o.insert("a", ());
863        o.insert("b", 4.3);
864
865        o.finish();
866
867        let (m, v) = b.finish();
868
869        let v1 = Variant::try_new(&m, &v).unwrap();
870
871        // second object, same field name but different values
872        let mut b = VariantBuilder::new();
873        let mut o = b.new_object();
874
875        o.insert("aardvark", ());
876        o.insert("barracuda", 3.3);
877
878        o.finish();
879
880        let (m, v) = b.finish();
881        let v2 = Variant::try_new(&m, &v).unwrap();
882
883        assert_ne!(v1, v2);
884    }
885
886    #[test]
887    fn test_objects_with_different_insertion_order_are_equal() {
888        let mut b = VariantBuilder::new();
889        let mut o = b.new_object();
890
891        o.insert("b", false);
892        o.insert("a", ());
893
894        o.finish();
895
896        let (m, v) = b.finish();
897
898        let v1 = Variant::try_new(&m, &v).unwrap();
899        assert!(!v1.metadata().is_sorted());
900
901        // create another object pre-filled with field names, b and a
902        // but insert the fields in the order of a, b
903        let mut b = VariantBuilder::new().with_field_names(["b", "a"]);
904        let mut o = b.new_object();
905
906        o.insert("a", ());
907        o.insert("b", false);
908
909        o.finish();
910
911        let (m, v) = b.finish();
912
913        let v2 = Variant::try_new(&m, &v).unwrap();
914
915        // v2 should also have a unsorted dictionary
916        assert!(!v2.metadata().is_sorted());
917
918        assert_eq!(v1, v2);
919    }
920
921    #[test]
922    fn test_objects_with_differing_metadata_are_equal() {
923        let mut b = VariantBuilder::new();
924        let mut o = b.new_object();
925
926        o.insert("a", ());
927        o.insert("b", 4.3);
928
929        o.finish();
930
931        let (meta1, value1) = b.finish();
932
933        let v1 = Variant::try_new(&meta1, &value1).unwrap();
934        // v1 is sorted
935        assert!(v1.metadata().is_sorted());
936
937        // create a second object with different insertion order
938        let mut b = VariantBuilder::new().with_field_names(["d", "c", "b", "a"]);
939        let mut o = b.new_object();
940
941        o.insert("b", 4.3);
942        o.insert("a", ());
943
944        o.finish();
945
946        let (meta2, value2) = b.finish();
947
948        let v2 = Variant::try_new(&meta2, &value2).unwrap();
949        // v2 is not sorted
950        assert!(!v2.metadata().is_sorted());
951
952        // object metadata are not the same
953        assert_ne!(v1.metadata(), v2.metadata());
954
955        // objects are still logically equal
956        assert_eq!(v1, v2);
957    }
958
959    #[test]
960    fn test_compare_object_with_unsorted_dictionary_vs_sorted_dictionary() {
961        // create a sorted object
962        let mut b = VariantBuilder::new();
963        let mut o = b.new_object();
964
965        o.insert("a", false);
966        o.insert("b", false);
967
968        o.finish();
969
970        let (m, v) = b.finish();
971
972        let v1 = Variant::try_new(&m, &v).unwrap();
973
974        // Create metadata with an unsorted dictionary (field names are "a", "a", "b")
975        // Since field names are not unique, it is considered not sorted.
976        let metadata_bytes = vec![
977            0b0000_0001,
978            3, // dictionary size
979            0, // "a"
980            1, // "b"
981            2, // "a"
982            3,
983            b'a',
984            b'b',
985            b'a',
986        ];
987        let m = VariantMetadata::try_new(&metadata_bytes).unwrap();
988        assert!(!m.is_sorted());
989
990        let v2 = Variant::new_with_metadata(m, &v);
991        assert_eq!(v1, v2);
992    }
993}