parquet_variant/variant/
object.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
19use crate::utils::{
20    first_byte_from_slice, overflow_error, slice_from_slice, try_binary_search_range_by,
21};
22use crate::variant::{Variant, VariantMetadata};
23
24use arrow_schema::ArrowError;
25
// The value header occupies one byte; use a named constant for readability.
// It is the offset of the `num_elements` field and the base for locating the field id array.
const NUM_HEADER_BYTES: u32 = 1;
28
/// Header structure for [`VariantObject`]
///
/// Holds the three byte widths decoded from an object's header byte by
/// [`VariantObjectHeader::try_new`].
#[derive(Debug, Clone, PartialEq)]
pub(crate) struct VariantObjectHeader {
    // Width of the `num_elements` field: four bytes when the header's `is_large` bit is set,
    // one byte otherwise
    num_elements_size: OffsetSizeBytes,
    // Width of each entry in the field id array
    field_id_size: OffsetSizeBytes,
    // Width of each entry in the field offset array
    field_offset_size: OffsetSizeBytes,
}
36
37impl VariantObjectHeader {
38    // Hide the ugly casting
39    const fn num_elements_size(&self) -> u32 {
40        self.num_elements_size as _
41    }
42    const fn field_id_size(&self) -> u32 {
43        self.field_id_size as _
44    }
45    const fn field_offset_size(&self) -> u32 {
46        self.field_offset_size as _
47    }
48
49    // Avoid materializing this offset, since it's cheaply and safely computable
50    const fn field_ids_start_byte(&self) -> u32 {
51        NUM_HEADER_BYTES + self.num_elements_size()
52    }
53
54    pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
55        // Parse the header byte to get object parameters
56        let value_header = header_byte >> 2;
57        let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
58        let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 bits
59        let is_large = (value_header & 0x10) != 0; // 5th bit
60        let num_elements_size = match is_large {
61            true => OffsetSizeBytes::Four,
62            false => OffsetSizeBytes::One,
63        };
64        Ok(Self {
65            num_elements_size,
66            field_id_size: OffsetSizeBytes::try_new(field_id_size_minus_one)?,
67            field_offset_size: OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
68        })
69    }
70}
71
/// A [`Variant`] Object (struct with named fields).
///
/// See the [Variant spec] file for more information.
///
/// # Validation
///
/// Every instance of variant object is either _valid_ or _invalid_, depending on whether the
/// underlying bytes are a valid encoding of a variant object subtype (see below).
///
/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully (and recursively)
/// _validated_. They always contain _valid_ data, and infallible accesses such as iteration and
/// indexing are panic-free. The validation cost is linear in the number of underlying bytes.
///
/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and
/// [`Self::try_field`] are provided as panic-free alternatives. [`Self::with_full_validation`] can
/// also be used to _validate_ an _unvalidated_ instance, if desired.
///
/// _Unvalidated_ instances can be constructed in constant time. They can be useful if the caller
/// knows the underlying bytes were already validated previously, or if the caller intends to
/// perform a small number of (fallible) field accesses against a large object.
///
/// A _validated_ instance guarantees that:
///
/// - header byte is valid
/// - num_elements is in bounds
/// - field id array is in bounds
/// - field offset array is in bounds
/// - field value array is in bounds
/// - all field ids are valid metadata dictionary entries (*)
/// - field ids are lexically ordered by their corresponding string values (*)
/// - all field offsets are in bounds (*)
/// - all field values are (recursively) _valid_ variant values (*)
/// - the associated variant metadata is [valid] (*)
///
/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)`
/// in the list above); it panics if any of the other checks fails.
///
/// # Safety
///
/// Even an _invalid_ variant object instance is still _safe_ to use in the Rust sense. Accessing it
/// with infallible methods may cause panics but will never lead to undefined behavior.
///
/// [valid]: VariantMetadata#Validation
/// [Variant spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-object-basic_type2
#[derive(Debug, Clone)]
pub struct VariantObject<'m, 'v> {
    /// The metadata dictionary used to resolve this object's field ids to field names.
    pub metadata: VariantMetadata<'m>,
    /// The bytes of this object's value, truncated to end where the last field offset points.
    pub value: &'v [u8],
    // Byte widths decoded from the first byte of `value`
    header: VariantObjectHeader,
    // Number of fields in the object
    num_elements: u32,
    // Byte offset within `value` at which the field offset array starts (i.e. the end of the
    // field id array)
    first_field_offset_byte: u32,
    // Byte offset within `value` at which the field values start (i.e. the end of the field
    // offset array)
    first_value_byte: u32,
    // Set once `with_full_validation` has succeeded
    validated: bool,
}
128
// We don't want this to grow because it could increase the size of `Variant` and hurt performance.
// NOTE(review): presumably `expect_size_of` rejects, at compile time, any size other than the
// 64 bytes given here -- confirm against `crate::utils`.
const _: () = crate::utils::expect_size_of::<VariantObject>(64);
131
132impl<'m, 'v> VariantObject<'m, 'v> {
133    pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
134        Self::try_new_with_shallow_validation(metadata, value).expect("Invalid variant object")
135    }
136
137    /// Attempts to interpet `metadata` and `value` as a variant object.
138    ///
139    /// # Validation
140    ///
141    /// This constructor verifies that `value` points to a valid variant object value. In
142    /// particular, that all field ids exist in `metadata`, and all offsets are in-bounds and point
143    /// to valid objects.
144    pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> {
145        Self::try_new_with_shallow_validation(metadata, value)?.with_full_validation()
146    }
147
148    /// Attempts to interpet `metadata` and `value` as a variant object, performing only basic
149    /// (constant-cost) [validation].
150    ///
151    /// [validation]: Self#Validation
152    pub(crate) fn try_new_with_shallow_validation(
153        metadata: VariantMetadata<'m>,
154        value: &'v [u8],
155    ) -> Result<Self, ArrowError> {
156        let header_byte = first_byte_from_slice(value)?;
157        let header = VariantObjectHeader::try_new(header_byte)?;
158
159        // Determine num_elements size based on is_large flag and fetch the value
160        let num_elements =
161            header
162                .num_elements_size
163                .unpack_u32_at_offset(value, NUM_HEADER_BYTES as _, 0)?;
164
165        // Calculate byte offsets for field offsets and values with overflow protection, and verify
166        // they're in bounds
167        let first_field_offset_byte = num_elements
168            .checked_mul(header.field_id_size())
169            .and_then(|n| n.checked_add(header.field_ids_start_byte()))
170            .ok_or_else(|| overflow_error("offset of variant object field offsets"))?;
171
172        let first_value_byte = num_elements
173            .checked_add(1)
174            .and_then(|n| n.checked_mul(header.field_offset_size()))
175            .and_then(|n| n.checked_add(first_field_offset_byte))
176            .ok_or_else(|| overflow_error("offset of variant object field values"))?;
177
178        let mut new_self = Self {
179            metadata,
180            value,
181            header,
182            num_elements,
183            first_field_offset_byte,
184            first_value_byte,
185            validated: false,
186        };
187
188        // Spec says: "The last field_offset points to the byte after the end of the last value"
189        //
190        // Use it to upper-bound the value bytes, which also verifies that the field id and field
191        // offset arrays are in bounds.
192        let last_offset = new_self
193            .get_offset(num_elements as _)?
194            .checked_add(first_value_byte)
195            .ok_or_else(|| overflow_error("variant object size"))?;
196        new_self.value = slice_from_slice(value, ..last_offset as _)?;
197        Ok(new_self)
198    }
199
    /// True if this instance is fully [validated] for panic-free infallible accesses.
    ///
    /// The flag is set by a successful call to [`Self::with_full_validation`].
    ///
    /// [validated]: Self#Validation
    pub fn is_fully_validated(&self) -> bool {
        self.validated
    }
206
207    /// Performs a full [validation] of this variant object.
208    ///
209    /// [validation]: Self#Validation
210    pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
211        if !self.validated {
212            // Validate the metadata dictionary first, if not already validated, because we pass it
213            // by value to all the children (who would otherwise re-validate it repeatedly).
214            self.metadata = self.metadata.with_full_validation()?;
215
216            let field_id_buffer = slice_from_slice(
217                self.value,
218                self.header.field_ids_start_byte() as _..self.first_field_offset_byte as _,
219            )?;
220
221            let mut field_ids_iter =
222                map_bytes_to_offsets(field_id_buffer, self.header.field_id_size);
223
224            // Validate all field ids exist in the metadata dictionary and the corresponding field names are lexicographically sorted
225            if self.metadata.is_sorted() {
226                // Since the metadata dictionary has unique and sorted field names, we can also guarantee this object's field names
227                // are lexicographically sorted by their field id ordering
228                let dictionary_size = self.metadata.len();
229
230                if let Some(mut current_id) = field_ids_iter.next() {
231                    for next_id in field_ids_iter {
232                        if current_id >= dictionary_size {
233                            return Err(ArrowError::InvalidArgumentError(
234                                "field id is not valid".to_string(),
235                            ));
236                        }
237
238                        if next_id <= current_id {
239                            return Err(ArrowError::InvalidArgumentError(
240                                "field names not sorted".to_string(),
241                            ));
242                        }
243                        current_id = next_id;
244                    }
245
246                    if current_id >= dictionary_size {
247                        return Err(ArrowError::InvalidArgumentError(
248                            "field id is not valid".to_string(),
249                        ));
250                    }
251                }
252            } else {
253                // The metadata dictionary can't guarantee uniqueness or sortedness, so we have to parse out the corresponding field names
254                // to check lexicographical order
255                //
256                // Since we are probing the metadata dictionary by field id, this also verifies field ids are in-bounds
257                let mut current_field_name = match field_ids_iter.next() {
258                    Some(field_id) => Some(self.metadata.get(field_id)?),
259                    None => None,
260                };
261
262                for field_id in field_ids_iter {
263                    let next_field_name = self.metadata.get(field_id)?;
264
265                    if let Some(current_name) = current_field_name {
266                        if next_field_name < current_name {
267                            return Err(ArrowError::InvalidArgumentError(
268                                "field names not sorted".to_string(),
269                            ));
270                        }
271                    }
272                    current_field_name = Some(next_field_name);
273                }
274            }
275
276            // Validate whether values are valid variant objects
277            let field_offset_buffer = slice_from_slice(
278                self.value,
279                self.first_field_offset_byte as _..self.first_value_byte as _,
280            )?;
281            let num_offsets = field_offset_buffer.len() / self.header.field_offset_size() as usize;
282
283            let value_buffer = slice_from_slice(self.value, self.first_value_byte as _..)?;
284
285            map_bytes_to_offsets(field_offset_buffer, self.header.field_offset_size)
286                .take(num_offsets.saturating_sub(1))
287                .try_for_each(|offset| {
288                    let value_bytes = slice_from_slice(value_buffer, offset..)?;
289                    Variant::try_new_with_metadata(self.metadata.clone(), value_bytes)?;
290
291                    Ok::<_, ArrowError>(())
292                })?;
293
294            self.validated = true;
295        }
296        Ok(self)
297    }
298
299    /// Returns the number of key-value pairs in this object
300    pub fn len(&self) -> usize {
301        self.num_elements as _
302    }
303
304    /// Returns true if the object contains no key-value pairs
305    pub fn is_empty(&self) -> bool {
306        self.len() == 0
307    }
308
309    /// Get a field's value by index in `0..self.len()`
310    ///
311    /// # Panics
312    ///
313    /// If the index is out of bounds. Also if variant object is corrupted (e.g., invalid offsets or
314    /// field IDs). The latter can only happen when working with an unvalidated object produced by
315    /// [`Self::new`].
316    pub fn field(&self, i: usize) -> Option<Variant<'m, 'v>> {
317        (i < self.len()).then(|| {
318            self.try_field_with_shallow_validation(i)
319                .expect("Invalid object field value")
320        })
321    }
322
323    /// Fallible version of `field`. Returns field value by index, capturing validation errors
324    pub fn try_field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
325        self.try_field_with_shallow_validation(i)?
326            .with_full_validation()
327    }
328
329    // Attempts to retrieve the ith field value from the value region of the byte buffer; it
330    // performs only basic (constant-cost) validation.
331    fn try_field_with_shallow_validation(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
332        let value_bytes = slice_from_slice(self.value, self.first_value_byte as _..)?;
333        let value_bytes = slice_from_slice(value_bytes, self.get_offset(i)? as _..)?;
334        Variant::try_new_with_metadata_and_shallow_validation(self.metadata.clone(), value_bytes)
335    }
336
337    // Attempts to retrieve the ith offset from the field offset region of the byte buffer.
338    fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
339        let byte_range = self.first_field_offset_byte as _..self.first_value_byte as _;
340        let field_offsets = slice_from_slice(self.value, byte_range)?;
341        self.header.field_offset_size.unpack_u32(field_offsets, i)
342    }
343
344    /// Get a field's name by index in `0..self.len()`
345    ///
346    /// # Panics
347    /// If the variant object is corrupted (e.g., invalid offsets or field IDs).
348    /// This should never happen since the constructor validates all data upfront.
349    pub fn field_name(&self, i: usize) -> Option<&'m str> {
350        (i < self.len()).then(|| {
351            self.try_field_name(i)
352                .expect("Invalid variant object field name")
353        })
354    }
355
356    /// Fallible version of `field_name`. Returns field name by index, capturing validation errors
357    fn try_field_name(&self, i: usize) -> Result<&'m str, ArrowError> {
358        let byte_range = self.header.field_ids_start_byte() as _..self.first_field_offset_byte as _;
359        let field_id_bytes = slice_from_slice(self.value, byte_range)?;
360        let field_id = self.header.field_id_size.unpack_u32(field_id_bytes, i)?;
361        self.metadata.get(field_id as _)
362    }
363
364    /// Returns an iterator of (name, value) pairs over the fields of this object.
365    pub fn iter(&self) -> impl Iterator<Item = (&'m str, Variant<'m, 'v>)> + '_ {
366        self.iter_try_with_shallow_validation()
367            .map(|result| result.expect("Invalid variant object field value"))
368    }
369
370    /// Fallible iteration over the fields of this object.
371    pub fn iter_try(
372        &self,
373    ) -> impl Iterator<Item = Result<(&'m str, Variant<'m, 'v>), ArrowError>> + '_ {
374        self.iter_try_with_shallow_validation().map(|result| {
375            let (name, value) = result?;
376            Ok((name, value.with_full_validation()?))
377        })
378    }
379
380    // Fallible iteration over the fields of this object that performs only shallow (constant-cost)
381    // validation of field values.
382    fn iter_try_with_shallow_validation(
383        &self,
384    ) -> impl Iterator<Item = Result<(&'m str, Variant<'m, 'v>), ArrowError>> + '_ {
385        (0..self.len()).map(|i| {
386            let field = self.try_field_with_shallow_validation(i)?;
387            Ok((self.try_field_name(i)?, field))
388        })
389    }
390
391    /// Returns the value of the field with the specified name, if any.
392    ///
393    /// `Ok(None)` means the field does not exist; `Err` means the search encountered an error.
394    pub fn get(&self, name: &str) -> Option<Variant<'m, 'v>> {
395        // Binary search through the field IDs of this object to find the requested field name.
396        //
397        // NOTE: This does not require a sorted metadata dictionary, because the variant spec
398        // requires object field ids to be lexically sorted by their corresponding string values,
399        // and probing the dictionary for a field id is always O(1) work.
400        let i = try_binary_search_range_by(0..self.len(), &name, |i| self.field_name(i))?.ok()?;
401
402        self.field(i)
403    }
404}
405
406// Custom implementation of PartialEq for variant objects
407//
408// According to the spec, field values are not required to be in the same order as the field IDs,
409// to enable flexibility when constructing Variant values
410//
411// Instead of comparing the raw bytes of 2 variant objects, this implementation recursively
412// checks whether the field values are equal -- regardless of their order
413impl<'m, 'v> PartialEq for VariantObject<'m, 'v> {
414    fn eq(&self, other: &Self) -> bool {
415        if self.num_elements != other.num_elements {
416            return false;
417        }
418
419        // IFF two objects are valid and logically equal, they will have the same
420        // field names in the same order, because the spec requires the object
421        // fields to be sorted lexicographically.
422        for ((name_a, value_a), (name_b, value_b)) in self.iter().zip(other.iter()) {
423            if name_a != name_b || value_a != value_b {
424                return false;
425            }
426        }
427
428        true
429    }
430}
431
432#[cfg(test)]
433mod tests {
434    use crate::VariantBuilder;
435
436    use super::*;
437
438    #[test]
439    fn test_variant_object_simple() {
440        // Create metadata with field names: "age", "name", "active" (sorted)
441        // Header: version=1, sorted=1, offset_size=1 (offset_size_minus_one=0)
442        // So header byte = 00_0_1_0001 = 0x11
443        let metadata_bytes = vec![
444            0b0001_0001,
445            3, // dictionary size
446            0, // "active"
447            6, // "age"
448            9, // "name"
449            13,
450            b'a',
451            b'c',
452            b't',
453            b'i',
454            b'v',
455            b'e',
456            b'a',
457            b'g',
458            b'e',
459            b'n',
460            b'a',
461            b'm',
462            b'e',
463        ];
464        let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
465
466        // Create object value data for: {"active": true, "age": 42, "name": "hello"}
467        // Field IDs in sorted order: [0, 1, 2] (active, age, name)
468        // Header: basic_type=2, field_offset_size_minus_one=0, field_id_size_minus_one=0, is_large=0
469        // value_header = 0000_00_00 = 0x00
470        // So header byte = (0x00 << 2) | 2 = 0x02
471        let object_value = vec![
472            0x02, // header: basic_type=2, value_header=0x00
473            3,    // num_elements = 3
474            // Field IDs (1 byte each): active=0, age=1, name=2
475            0, 1, 2,
476            // Field offsets (1 byte each): 4 offsets total
477            0, // offset to first value (boolean true)
478            1, // offset to second value (int8)
479            3, // offset to third value (short string)
480            9, // end offset
481            // Values:
482            0x04, // boolean true: primitive_header=1, basic_type=0 -> (1 << 2) | 0 = 0x04
483            0x0C,
484            42, // int8: primitive_header=3, basic_type=0 -> (3 << 2) | 0 = 0x0C, then value 42
485            0x15, b'h', b'e', b'l', b'l',
486            b'o', // short string: length=5, basic_type=1 -> (5 << 2) | 1 = 0x15
487        ];
488
489        let variant_obj = VariantObject::try_new(metadata, &object_value).unwrap();
490
491        // Test basic properties
492        assert_eq!(variant_obj.len(), 3);
493        assert!(!variant_obj.is_empty());
494
495        // Test field access
496        let active_field = variant_obj.get("active");
497        assert!(active_field.is_some());
498        assert_eq!(active_field.unwrap().as_boolean(), Some(true));
499
500        let age_field = variant_obj.get("age");
501        assert!(age_field.is_some());
502        assert_eq!(age_field.unwrap().as_int8(), Some(42));
503
504        let name_field = variant_obj.get("name");
505        assert!(name_field.is_some());
506        assert_eq!(name_field.unwrap().as_string(), Some("hello"));
507
508        // Test non-existent field
509        let missing_field = variant_obj.get("missing");
510        assert!(missing_field.is_none());
511
512        let missing_field_name = variant_obj.field_name(3);
513        assert!(missing_field_name.is_none());
514
515        let missing_field_name = variant_obj.field_name(300);
516        assert!(missing_field_name.is_none());
517
518        let missing_field_value = variant_obj.field(3);
519        assert!(missing_field_value.is_none());
520
521        let missing_field_value = variant_obj.field(300);
522        assert!(missing_field_value.is_none());
523
524        // Test fields iterator
525        let fields: Vec<_> = variant_obj.iter().collect();
526        assert_eq!(fields.len(), 3);
527
528        // Fields should be in sorted order: active, age, name
529        assert_eq!(fields[0].0, "active");
530        assert_eq!(fields[0].1.as_boolean(), Some(true));
531
532        assert_eq!(fields[1].0, "age");
533        assert_eq!(fields[1].1.as_int8(), Some(42));
534
535        assert_eq!(fields[2].0, "name");
536        assert_eq!(fields[2].1.as_string(), Some("hello"));
537
538        // Test field access by index
539        // Fields should be in sorted order: active, age, name
540        assert_eq!(variant_obj.field_name(0), Some("active"));
541        assert_eq!(variant_obj.field(0).unwrap().as_boolean(), Some(true));
542
543        assert_eq!(variant_obj.field_name(1), Some("age"));
544        assert_eq!(variant_obj.field(1).unwrap().as_int8(), Some(42));
545
546        assert_eq!(variant_obj.field_name(2), Some("name"));
547        assert_eq!(variant_obj.field(2).unwrap().as_string(), Some("hello"));
548    }
549
550    #[test]
551    fn test_variant_object_empty_fields() {
552        let mut builder = VariantBuilder::new();
553        builder.new_object().with_field("", 42).finish().unwrap();
554        let (metadata, value) = builder.finish();
555
556        // Resulting object is valid and has a single empty field
557        let variant = Variant::try_new(&metadata, &value).unwrap();
558        let variant_obj = variant.as_object().unwrap();
559        assert_eq!(variant_obj.len(), 1);
560        assert_eq!(variant_obj.get(""), Some(Variant::from(42)));
561    }
562
563    #[test]
564    fn test_variant_object_empty() {
565        // Create metadata with no fields
566        let metadata_bytes = vec![
567            0x11, // header: version=1, sorted=0, offset_size_minus_one=0
568            0,    // dictionary_size = 0
569            0,    // offset[0] = 0 (end of dictionary)
570        ];
571        let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
572
573        // Create empty object value data: {}
574        let object_value = vec![
575            0x02, // header: basic_type=2, value_header=0x00
576            0,    // num_elements = 0
577            0,    // single offset pointing to end
578                  // No field IDs, no values
579        ];
580
581        let variant_obj = VariantObject::try_new(metadata, &object_value).unwrap();
582
583        // Test basic properties
584        assert_eq!(variant_obj.len(), 0);
585        assert!(variant_obj.is_empty());
586
587        // Test field access on empty object
588        let missing_field = variant_obj.get("anything");
589        assert!(missing_field.is_none());
590
591        // Test fields iterator on empty object
592        let fields: Vec<_> = variant_obj.iter().collect();
593        assert_eq!(fields.len(), 0);
594    }
595
596    #[test]
597    fn test_variant_object_invalid_metadata_end_offset() {
598        // Create metadata with field names: "age", "name" (sorted)
599        let metadata_bytes = vec![
600            0b0001_0001, // header: version=1, sorted=1, offset_size_minus_one=0
601            2,           // dictionary size
602            0,           // "age"
603            3,           // "name"
604            8,           // Invalid end offset (should be 7)
605            b'a',
606            b'g',
607            b'e',
608            b'n',
609            b'a',
610            b'm',
611            b'e',
612        ];
613        let err = VariantMetadata::try_new(&metadata_bytes);
614        let err = err.unwrap_err();
615        assert!(matches!(
616            err,
617            ArrowError::InvalidArgumentError(ref msg) if msg.contains("Tried to extract byte(s) ..13 from 12-byte buffer")
618        ));
619    }
620
621    #[test]
622    fn test_variant_object_invalid_end_offset() {
623        // Create metadata with field names: "age", "name" (sorted)
624        let metadata_bytes = vec![
625            0b0001_0001, // header: version=1, sorted=1, offset_size_minus_one=0
626            2,           // dictionary size
627            0,           // "age"
628            3,           // "name"
629            7,
630            b'a',
631            b'g',
632            b'e',
633            b'n',
634            b'a',
635            b'm',
636            b'e',
637        ];
638        let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
639
640        // Create object value data for: {"age": 42, "name": "hello"}
641        // Field IDs in sorted order: [0, 1] (age, name)
642        // Header: basic_type=2, field_offset_size_minus_one=0, field_id_size_minus_one=0, is_large=0
643        // value_header = 0000_00_00 = 0x00
644        let object_value = vec![
645            0x02, // header: basic_type=2, value_header=0x00
646            2,    // num_elements = 2
647            // Field IDs (1 byte each): age=0, name=1
648            0, 1,
649            // Field offsets (1 byte each): 3 offsets total
650            0, // offset to first value (int8)
651            2, // offset to second value (short string)
652            9, // invalid end offset (correct would be 8)
653            // Values:
654            0x0C,
655            42, // int8: primitive_header=3, basic_type=0 -> (3 << 2) | 0 = 0x0C, then value 42
656            0x15, b'h', b'e', b'l', b'l',
657            b'o', // short string: length=5, basic_type=1 -> (5 << 2) | 1 = 0x15
658        ];
659
660        let err = VariantObject::try_new(metadata, &object_value);
661        let err = err.unwrap_err();
662        assert!(matches!(
663            err,
664            ArrowError::InvalidArgumentError(ref msg) if msg.contains("Tried to extract byte(s) ..16 from 15-byte buffer")
665        ));
666    }
667
668    fn test_variant_object_with_count(count: i32, expected_field_id_size: OffsetSizeBytes) {
669        let field_names: Vec<_> = (0..count).map(|val| val.to_string()).collect();
670        let mut builder =
671            VariantBuilder::new().with_field_names(field_names.iter().map(|s| s.as_str()));
672
673        let mut obj = builder.new_object();
674
675        for i in 0..count {
676            obj.insert(&field_names[i as usize], i);
677        }
678
679        obj.finish().unwrap();
680        let (metadata, value) = builder.finish();
681        let variant = Variant::new(&metadata, &value);
682
683        if let Variant::Object(obj) = variant {
684            assert_eq!(obj.len(), count as usize);
685
686            assert_eq!(obj.get(&field_names[0]).unwrap(), Variant::Int32(0));
687            assert_eq!(
688                obj.get(&field_names[(count - 1) as usize]).unwrap(),
689                Variant::Int32(count - 1)
690            );
691            assert_eq!(
692                obj.header.field_id_size, expected_field_id_size,
693                "Expected {}-byte field IDs, got {}-byte field IDs",
694                expected_field_id_size as usize, obj.header.field_id_size as usize
695            );
696        } else {
697            panic!("Expected object variant");
698        }
699    }
700
    // 2^8 + 1 fields: expected to need 2-byte field IDs
    #[test]
    fn test_variant_object_257_elements() {
        test_variant_object_with_count((1 << 8) + 1, OffsetSizeBytes::Two); // 2^8 + 1, expected 2-byte field IDs
    }
705
    // 2^16 + 1 fields: expected to need 3-byte field IDs
    #[test]
    fn test_variant_object_65537_elements() {
        test_variant_object_with_count((1 << 16) + 1, OffsetSizeBytes::Three);
        // 2^16 + 1, expected 3-byte field IDs
    }
711
712    /* Can't run this test now as it takes 45x longer than other tests
713    #[test]
714    fn test_variant_object_16777217_elements() {
715        test_variant_object_with_count((1 << 24) + 1, OffsetSizeBytes::Four);
716        // 2^24 + 1, expected 4-byte field IDs
717    }
718     */
719
    // 255 fields: expected to still fit 1-byte field IDs
    #[test]
    fn test_variant_object_small_sizes_255_elements() {
        test_variant_object_with_count(255, OffsetSizeBytes::One);
    }
724
725    fn test_variant_object_with_large_data(
726        data_size_per_field: usize,
727        expected_field_offset_size: OffsetSizeBytes,
728    ) {
729        let num_fields = 20;
730        let mut builder = VariantBuilder::new();
731        let mut obj = builder.new_object();
732
733        let str_val = "a".repeat(data_size_per_field);
734
735        for val in 0..num_fields {
736            let key = format!("id_{val}");
737            obj.insert(&key, str_val.as_str());
738        }
739
740        obj.finish().unwrap();
741        let (metadata, value) = builder.finish();
742        let variant = Variant::new(&metadata, &value);
743
744        if let Variant::Object(obj) = variant {
745            assert_eq!(obj.len(), num_fields);
746            assert_eq!(
747                obj.header.field_offset_size, expected_field_offset_size,
748                "Expected {}-byte field offsets, got {}-byte field offsets",
749                expected_field_offset_size as usize, obj.header.field_offset_size as usize
750            );
751        } else {
752            panic!("Expected object variant");
753        }
754    }
755
756    #[test]
757    fn test_variant_object_child_data_0_byte_offsets_minus_one() {
758        test_variant_object_with_large_data(10, OffsetSizeBytes::One);
759    }
760
761    #[test]
762    fn test_variant_object_256_bytes_child_data_3_byte_offsets() {
763        test_variant_object_with_large_data(256 + 1, OffsetSizeBytes::Two); // 2^8 - 2^16 elements
764    }
765
766    #[test]
767    fn test_variant_object_16777216_bytes_child_data_4_byte_offsets() {
768        test_variant_object_with_large_data(65536 + 1, OffsetSizeBytes::Three); // 2^16 - 2^24 elements
769    }
770
771    #[test]
772    fn test_variant_object_65535_bytes_child_data_2_byte_offsets() {
773        test_variant_object_with_large_data(16777216 + 1, OffsetSizeBytes::Four);
774        // 2^24
775    }
776
777    #[test]
778    fn test_objects_with_same_fields_are_equal() {
779        let mut b = VariantBuilder::new();
780        let mut o = b.new_object();
781
782        o.insert("b", ());
783        o.insert("c", ());
784        o.insert("a", ());
785
786        o.finish().unwrap();
787
788        let (m, v) = b.finish();
789
790        let v1 = Variant::try_new(&m, &v).unwrap();
791        let v2 = Variant::try_new(&m, &v).unwrap();
792
793        assert_eq!(v1, v2);
794    }
795
796    #[test]
797    fn test_same_objects_with_different_builder_are_equal() {
798        let mut b = VariantBuilder::new();
799        let mut o = b.new_object();
800
801        o.insert("a", ());
802        o.insert("b", false);
803
804        o.finish().unwrap();
805        let (m, v) = b.finish();
806
807        let v1 = Variant::try_new(&m, &v).unwrap();
808
809        let mut b = VariantBuilder::new();
810        let mut o = b.new_object();
811
812        o.insert("a", ());
813        o.insert("b", false);
814
815        o.finish().unwrap();
816        let (m, v) = b.finish();
817
818        let v2 = Variant::try_new(&m, &v).unwrap();
819
820        assert_eq!(v1, v2);
821    }
822
823    #[test]
824    fn test_objects_with_different_values_are_not_equal() {
825        let mut b = VariantBuilder::new();
826        let mut o = b.new_object();
827
828        o.insert("a", ());
829        o.insert("b", 4.3);
830
831        o.finish().unwrap();
832
833        let (m, v) = b.finish();
834
835        let v1 = Variant::try_new(&m, &v).unwrap();
836
837        // second object, same field name but different values
838        let mut b = VariantBuilder::new();
839        let mut o = b.new_object();
840
841        o.insert("a", ());
842        let mut inner_o = o.new_object("b");
843        inner_o.insert("a", 3.3);
844        inner_o.finish().unwrap();
845        o.finish().unwrap();
846
847        let (m, v) = b.finish();
848
849        let v2 = Variant::try_new(&m, &v).unwrap();
850
851        let m1 = v1.metadata().unwrap();
852        let m2 = v2.metadata().unwrap();
853
854        // metadata would be equal since they contain the same keys
855        assert_eq!(m1, m2);
856
857        // but the objects are not equal
858        assert_ne!(v1, v2);
859    }
860
861    #[test]
862    fn test_objects_with_different_field_names_are_not_equal() {
863        let mut b = VariantBuilder::new();
864        let mut o = b.new_object();
865
866        o.insert("a", ());
867        o.insert("b", 4.3);
868
869        o.finish().unwrap();
870
871        let (m, v) = b.finish();
872
873        let v1 = Variant::try_new(&m, &v).unwrap();
874
875        // second object, same field name but different values
876        let mut b = VariantBuilder::new();
877        let mut o = b.new_object();
878
879        o.insert("aardvark", ());
880        o.insert("barracuda", 3.3);
881
882        o.finish().unwrap();
883
884        let (m, v) = b.finish();
885        let v2 = Variant::try_new(&m, &v).unwrap();
886
887        assert_ne!(v1, v2);
888    }
889
890    #[test]
891    fn test_objects_with_different_insertion_order_are_equal() {
892        let mut b = VariantBuilder::new();
893        let mut o = b.new_object();
894
895        o.insert("b", false);
896        o.insert("a", ());
897
898        o.finish().unwrap();
899
900        let (m, v) = b.finish();
901
902        let v1 = Variant::try_new(&m, &v).unwrap();
903        assert!(!v1.metadata().unwrap().is_sorted());
904
905        // create another object pre-filled with field names, b and a
906        // but insert the fields in the order of a, b
907        let mut b = VariantBuilder::new().with_field_names(["b", "a"].into_iter());
908        let mut o = b.new_object();
909
910        o.insert("a", ());
911        o.insert("b", false);
912
913        o.finish().unwrap();
914
915        let (m, v) = b.finish();
916
917        let v2 = Variant::try_new(&m, &v).unwrap();
918
919        // v2 should also have a unsorted dictionary
920        assert!(!v2.metadata().unwrap().is_sorted());
921
922        assert_eq!(v1, v2);
923    }
924
925    #[test]
926    fn test_objects_with_differing_metadata_are_equal() {
927        let mut b = VariantBuilder::new();
928        let mut o = b.new_object();
929
930        o.insert("a", ());
931        o.insert("b", 4.3);
932
933        o.finish().unwrap();
934
935        let (meta1, value1) = b.finish();
936
937        let v1 = Variant::try_new(&meta1, &value1).unwrap();
938        // v1 is sorted
939        assert!(v1.metadata().unwrap().is_sorted());
940
941        // create a second object with different insertion order
942        let mut b = VariantBuilder::new().with_field_names(["d", "c", "b", "a"].into_iter());
943        let mut o = b.new_object();
944
945        o.insert("b", 4.3);
946        o.insert("a", ());
947
948        o.finish().unwrap();
949
950        let (meta2, value2) = b.finish();
951
952        let v2 = Variant::try_new(&meta2, &value2).unwrap();
953        // v2 is not sorted
954        assert!(!v2.metadata().unwrap().is_sorted());
955
956        // object metadata are not the same
957        assert_ne!(v1.metadata(), v2.metadata());
958
959        // objects are still logically equal
960        assert_eq!(v1, v2);
961    }
962
963    #[test]
964    fn test_compare_object_with_unsorted_dictionary_vs_sorted_dictionary() {
965        // create a sorted object
966        let mut b = VariantBuilder::new();
967        let mut o = b.new_object();
968
969        o.insert("a", false);
970        o.insert("b", false);
971
972        o.finish().unwrap();
973
974        let (m, v) = b.finish();
975
976        let v1 = Variant::try_new(&m, &v).unwrap();
977
978        // Create metadata with an unsorted dictionary (field names are "a", "a", "b")
979        // Since field names are not unique, it is considered not sorted.
980        let metadata_bytes = vec![
981            0b0000_0001,
982            3, // dictionary size
983            0, // "a"
984            1, // "b"
985            2, // "a"
986            3,
987            b'a',
988            b'b',
989            b'a',
990        ];
991        let m = VariantMetadata::try_new(&metadata_bytes).unwrap();
992        assert!(!m.is_sorted());
993
994        let v2 = Variant::new_with_metadata(m, &v);
995        assert_eq!(v1, v2);
996    }
997}