parquet_variant_compute/
shred_variant.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Module for shredding VariantArray with a given schema.
19
20use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
21use crate::variant_to_arrow::{
22    PrimitiveVariantToArrowRowBuilder, make_primitive_variant_to_arrow_row_builder,
23};
24use crate::{VariantArray, VariantValueArrayBuilder};
25use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder};
26use arrow::buffer::NullBuffer;
27use arrow::compute::CastOptions;
28use arrow::datatypes::{DataType, Fields, TimeUnit};
29use arrow::error::{ArrowError, Result};
30use parquet_variant::{Variant, VariantBuilderExt};
31
32use indexmap::IndexMap;
33use std::sync::Arc;
34
35/// Shreds the input binary variant using a target shredding schema derived from the requested data type.
36///
37/// For example, requesting `DataType::Int64` would produce an output variant array with the schema:
38///
39/// ```text
40/// {
41///    metadata: BINARY,
42///    value: BINARY,
43///    typed_value: LONG,
44/// }
45/// ```
46///
47/// Similarly, requesting `DataType::Struct` with two integer fields `a` and `b` would produce an
48/// output variant array with the schema:
49///
50/// ```text
51/// {
52///   metadata: BINARY,
53///   value: BINARY,
54///   typed_value: {
55///     a: {
56///       value: BINARY,
57///       typed_value: INT,
58///     },
59///     b: {
60///       value: BINARY,
61///       typed_value: INT,
62///     },
63///   }
64/// }
65/// ```
66pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result<VariantArray> {
67    if array.typed_value_field().is_some() {
68        return Err(ArrowError::InvalidArgumentError(
69            "Input is already shredded".to_string(),
70        ));
71    }
72
73    if array.value_field().is_none() {
74        // all-null case -- nothing to do.
75        return Ok(array.clone());
76    };
77
78    let cast_options = CastOptions::default();
79    let mut builder = make_variant_to_shredded_variant_arrow_row_builder(
80        as_type,
81        &cast_options,
82        array.len(),
83        true,
84    )?;
85    for i in 0..array.len() {
86        if array.is_null(i) {
87            builder.append_null()?;
88        } else {
89            builder.append_value(array.value(i))?;
90        }
91    }
92    let (value, typed_value, nulls) = builder.finish()?;
93    Ok(VariantArray::from_parts(
94        array.metadata_field().clone(),
95        Some(value),
96        Some(typed_value),
97        nulls,
98    ))
99}
100
101pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>(
102    data_type: &'a DataType,
103    cast_options: &'a CastOptions,
104    capacity: usize,
105    top_level: bool,
106) -> Result<VariantToShreddedVariantRowBuilder<'a>> {
107    let builder = match data_type {
108        DataType::Struct(fields) => {
109            let typed_value_builder = VariantToShreddedObjectVariantRowBuilder::try_new(
110                fields,
111                cast_options,
112                capacity,
113                top_level,
114            )?;
115            VariantToShreddedVariantRowBuilder::Object(typed_value_builder)
116        }
117        DataType::List(_)
118        | DataType::LargeList(_)
119        | DataType::ListView(_)
120        | DataType::LargeListView(_)
121        | DataType::FixedSizeList(..) => {
122            return Err(ArrowError::NotYetImplemented(
123                "Shredding variant array values as arrow lists".to_string(),
124            ));
125        }
126        // Supported shredded primitive types, see Variant shredding spec:
127        // https://github.com/apache/parquet-format/blob/master/VariantShredding.md#shredded-value-types
128        DataType::Boolean
129        | DataType::Int8
130        | DataType::Int16
131        | DataType::Int32
132        | DataType::Int64
133        | DataType::Float32
134        | DataType::Float64
135        | DataType::Decimal32(..)
136        | DataType::Decimal64(..)
137        | DataType::Decimal128(..)
138        | DataType::Date32
139        | DataType::Time64(TimeUnit::Microsecond)
140        | DataType::Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _)
141        | DataType::Binary
142        | DataType::BinaryView
143        | DataType::Utf8
144        | DataType::Utf8View
145        | DataType::FixedSizeBinary(16) // UUID
146        => {
147            let builder =
148                make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?;
149            let typed_value_builder =
150                VariantToShreddedPrimitiveVariantRowBuilder::new(builder, capacity, top_level);
151            VariantToShreddedVariantRowBuilder::Primitive(typed_value_builder)
152        }
153        DataType::FixedSizeBinary(_) => {
154            return Err(ArrowError::InvalidArgumentError(format!("{data_type} is not a valid variant shredding type. Only FixedSizeBinary(16) for UUID is supported.")))
155        }
156        _ => {
157            return Err(ArrowError::InvalidArgumentError(format!("{data_type} is not a valid variant shredding type")))
158        }
159    };
160    Ok(builder)
161}
162
163pub(crate) enum VariantToShreddedVariantRowBuilder<'a> {
164    Primitive(VariantToShreddedPrimitiveVariantRowBuilder<'a>),
165    Object(VariantToShreddedObjectVariantRowBuilder<'a>),
166}
167impl<'a> VariantToShreddedVariantRowBuilder<'a> {
168    pub fn append_null(&mut self) -> Result<()> {
169        use VariantToShreddedVariantRowBuilder::*;
170        match self {
171            Primitive(b) => b.append_null(),
172            Object(b) => b.append_null(),
173        }
174    }
175
176    pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
177        use VariantToShreddedVariantRowBuilder::*;
178        match self {
179            Primitive(b) => b.append_value(value),
180            Object(b) => b.append_value(value),
181        }
182    }
183
184    pub fn finish(self) -> Result<(BinaryViewArray, ArrayRef, Option<NullBuffer>)> {
185        use VariantToShreddedVariantRowBuilder::*;
186        match self {
187            Primitive(b) => b.finish(),
188            Object(b) => b.finish(),
189        }
190    }
191}
192
193/// A top-level variant shredder -- appending NULL produces typed_value=NULL and value=Variant::Null
194pub(crate) struct VariantToShreddedPrimitiveVariantRowBuilder<'a> {
195    value_builder: VariantValueArrayBuilder,
196    typed_value_builder: PrimitiveVariantToArrowRowBuilder<'a>,
197    nulls: NullBufferBuilder,
198    top_level: bool,
199}
200
201impl<'a> VariantToShreddedPrimitiveVariantRowBuilder<'a> {
202    pub(crate) fn new(
203        typed_value_builder: PrimitiveVariantToArrowRowBuilder<'a>,
204        capacity: usize,
205        top_level: bool,
206    ) -> Self {
207        Self {
208            value_builder: VariantValueArrayBuilder::new(capacity),
209            typed_value_builder,
210            nulls: NullBufferBuilder::new(capacity),
211            top_level,
212        }
213    }
214    fn append_null(&mut self) -> Result<()> {
215        // Only the top-level struct that represents the variant can be nullable; object fields and
216        // array elements are non-nullable.
217        self.nulls.append(!self.top_level);
218        self.value_builder.append_null();
219        self.typed_value_builder.append_null()
220    }
221    fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
222        self.nulls.append_non_null();
223        if self.typed_value_builder.append_value(&value)? {
224            self.value_builder.append_null();
225        } else {
226            self.value_builder.append_value(value);
227        }
228        Ok(true)
229    }
230    fn finish(mut self) -> Result<(BinaryViewArray, ArrayRef, Option<NullBuffer>)> {
231        Ok((
232            self.value_builder.build()?,
233            self.typed_value_builder.finish()?,
234            self.nulls.finish(),
235        ))
236    }
237}
238
239pub(crate) struct VariantToShreddedObjectVariantRowBuilder<'a> {
240    value_builder: VariantValueArrayBuilder,
241    typed_value_builders: IndexMap<&'a str, VariantToShreddedVariantRowBuilder<'a>>,
242    typed_value_nulls: NullBufferBuilder,
243    nulls: NullBufferBuilder,
244    top_level: bool,
245}
246
247impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
248    fn try_new(
249        fields: &'a Fields,
250        cast_options: &'a CastOptions,
251        capacity: usize,
252        top_level: bool,
253    ) -> Result<Self> {
254        let typed_value_builders = fields.iter().map(|field| {
255            let builder = make_variant_to_shredded_variant_arrow_row_builder(
256                field.data_type(),
257                cast_options,
258                capacity,
259                false,
260            )?;
261            Ok((field.name().as_str(), builder))
262        });
263        Ok(Self {
264            value_builder: VariantValueArrayBuilder::new(capacity),
265            typed_value_builders: typed_value_builders.collect::<Result<_>>()?,
266            typed_value_nulls: NullBufferBuilder::new(capacity),
267            nulls: NullBufferBuilder::new(capacity),
268            top_level,
269        })
270    }
271
272    fn append_null(&mut self) -> Result<()> {
273        // Only the top-level struct that represents the variant can be nullable; object fields and
274        // array elements are non-nullable.
275        self.nulls.append(!self.top_level);
276        self.value_builder.append_null();
277        self.typed_value_nulls.append_null();
278        for (_, typed_value_builder) in &mut self.typed_value_builders {
279            typed_value_builder.append_null()?;
280        }
281        Ok(())
282    }
283    fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
284        let Variant::Object(ref obj) = value else {
285            // Not an object => fall back
286            self.nulls.append_non_null();
287            self.value_builder.append_value(value);
288            self.typed_value_nulls.append_null();
289            for (_, typed_value_builder) in &mut self.typed_value_builders {
290                typed_value_builder.append_null()?;
291            }
292            return Ok(false);
293        };
294
295        // Route the object's fields by name as either shredded or unshredded
296        let mut builder = self.value_builder.builder_ext(value.metadata());
297        let mut object_builder = builder.try_new_object()?;
298        let mut seen = std::collections::HashSet::new();
299        let mut partially_shredded = false;
300        for (field_name, value) in obj.iter() {
301            match self.typed_value_builders.get_mut(field_name) {
302                Some(typed_value_builder) => {
303                    typed_value_builder.append_value(value)?;
304                    seen.insert(field_name);
305                }
306                None => {
307                    object_builder.insert_bytes(field_name, value);
308                    partially_shredded = true;
309                }
310            }
311        }
312
313        // Handle missing fields
314        for (field_name, typed_value_builder) in &mut self.typed_value_builders {
315            if !seen.contains(field_name) {
316                typed_value_builder.append_null()?;
317            }
318        }
319
320        // Only emit the value if it captured any unshredded object fields
321        if partially_shredded {
322            object_builder.finish();
323        } else {
324            drop(object_builder);
325            self.value_builder.append_null();
326        }
327
328        self.typed_value_nulls.append_non_null();
329        self.nulls.append_non_null();
330        Ok(true)
331    }
332    fn finish(mut self) -> Result<(BinaryViewArray, ArrayRef, Option<NullBuffer>)> {
333        let mut builder = StructArrayBuilder::new();
334        for (field_name, typed_value_builder) in self.typed_value_builders {
335            let (value, typed_value, nulls) = typed_value_builder.finish()?;
336            let array =
337                ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls);
338            builder = builder.with_field(field_name, ArrayRef::from(array), false);
339        }
340        if let Some(nulls) = self.typed_value_nulls.finish() {
341            builder = builder.with_nulls(nulls);
342        }
343        Ok((
344            self.value_builder.build()?,
345            Arc::new(builder.build()),
346            self.nulls.finish(),
347        ))
348    }
349}
350
351#[cfg(test)]
352mod tests {
353    use super::*;
354    use crate::VariantArrayBuilder;
355    use arrow::array::{Array, FixedSizeBinaryArray, Float64Array, Int64Array};
356    use arrow::datatypes::{DataType, Field, Fields, TimeUnit, UnionFields, UnionMode};
357    use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant, VariantBuilder};
358    use std::sync::Arc;
359    use uuid::Uuid;
360
361    #[test]
362    fn test_already_shredded_input_error() {
363        // Create a VariantArray that already has typed_value_field
364        // First create a valid VariantArray, then extract its parts to construct a shredded one
365        let temp_array = VariantArray::from_iter(vec![Some(Variant::from("test"))]);
366        let metadata = temp_array.metadata_field().clone();
367        let value = temp_array.value_field().unwrap().clone();
368        let typed_value = Arc::new(Int64Array::from(vec![42])) as ArrayRef;
369
370        let shredded_array =
371            VariantArray::from_parts(metadata, Some(value), Some(typed_value), None);
372
373        let result = shred_variant(&shredded_array, &DataType::Int64);
374        assert!(matches!(
375            result.unwrap_err(),
376            ArrowError::InvalidArgumentError(_)
377        ));
378    }
379
380    #[test]
381    fn test_all_null_input() {
382        // Create VariantArray with no value field (all null case)
383        let metadata = BinaryViewArray::from_iter_values([&[1u8, 0u8]]); // minimal valid metadata
384        let all_null_array = VariantArray::from_parts(metadata, None, None, None);
385        let result = shred_variant(&all_null_array, &DataType::Int64).unwrap();
386
387        // Should return array with no value/typed_value fields
388        assert!(result.value_field().is_none());
389        assert!(result.typed_value_field().is_none());
390    }
391
392    #[test]
393    fn test_unsupported_list_schema() {
394        let input = VariantArray::from_iter([Variant::from(42)]);
395        let list_schema = DataType::List(Arc::new(Field::new("item", DataType::Int64, true)));
396        shred_variant(&input, &list_schema).expect_err("unsupported");
397    }
398
399    #[test]
400    fn test_invalid_fixed_size_binary_shredding() {
401        let mock_uuid_1 = Uuid::new_v4();
402
403        let input = VariantArray::from_iter([Some(Variant::from(mock_uuid_1)), None]);
404
405        // shred_variant only supports FixedSizeBinary(16). Any other length will err.
406        let err = shred_variant(&input, &DataType::FixedSizeBinary(17)).unwrap_err();
407
408        assert_eq!(
409            err.to_string(),
410            "Invalid argument error: FixedSizeBinary(17) is not a valid variant shredding type. Only FixedSizeBinary(16) for UUID is supported."
411        );
412    }
413
414    #[test]
415    fn test_uuid_shredding() {
416        let mock_uuid_1 = Uuid::new_v4();
417        let mock_uuid_2 = Uuid::new_v4();
418
419        let input = VariantArray::from_iter([
420            Some(Variant::from(mock_uuid_1)),
421            None,
422            Some(Variant::from(false)),
423            Some(Variant::from(mock_uuid_2)),
424        ]);
425
426        let variant_array = shred_variant(&input, &DataType::FixedSizeBinary(16)).unwrap();
427
428        // // inspect the typed_value Field and make sure it contains the canonical Uuid extension type
429        // let typed_value_field = variant_array
430        //     .inner()
431        //     .fields()
432        //     .into_iter()
433        //     .find(|f| f.name() == "typed_value")
434        //     .unwrap();
435
436        // assert!(
437        //     typed_value_field
438        //         .try_extension_type::<extension::Uuid>()
439        //         .is_ok()
440        // );
441
442        // probe the downcasted typed_value array to make sure uuids are shredded correctly
443        let uuids = variant_array
444            .typed_value_field()
445            .unwrap()
446            .as_any()
447            .downcast_ref::<FixedSizeBinaryArray>()
448            .unwrap();
449
450        assert_eq!(uuids.len(), 4);
451
452        assert!(!uuids.is_null(0));
453
454        let got_uuid_1: &[u8] = uuids.value(0);
455        assert_eq!(got_uuid_1, mock_uuid_1.as_bytes());
456
457        assert!(uuids.is_null(1));
458        assert!(uuids.is_null(2));
459
460        assert!(!uuids.is_null(3));
461
462        let got_uuid_2: &[u8] = uuids.value(3);
463        assert_eq!(got_uuid_2, mock_uuid_2.as_bytes());
464    }
465
466    #[test]
467    fn test_primitive_shredding_comprehensive() {
468        // Test mixed scenarios in a single array
469        let input = VariantArray::from_iter(vec![
470            Some(Variant::from(42i64)),   // successful shred
471            Some(Variant::from("hello")), // failed shred (string)
472            Some(Variant::from(100i64)),  // successful shred
473            None,                         // array-level null
474            Some(Variant::Null),          // variant null
475            Some(Variant::from(3i8)),     // successful shred (int8->int64 conversion)
476        ]);
477
478        let result = shred_variant(&input, &DataType::Int64).unwrap();
479
480        // Verify structure
481        let metadata_field = result.metadata_field();
482        let value_field = result.value_field().unwrap();
483        let typed_value_field = result
484            .typed_value_field()
485            .unwrap()
486            .as_any()
487            .downcast_ref::<Int64Array>()
488            .unwrap();
489
490        // Check specific outcomes for each row
491        assert_eq!(result.len(), 6);
492
493        // Row 0: 42 -> should shred successfully
494        assert!(!result.is_null(0));
495        assert!(value_field.is_null(0)); // value should be null when shredded
496        assert!(!typed_value_field.is_null(0));
497        assert_eq!(typed_value_field.value(0), 42);
498
499        // Row 1: "hello" -> should fail to shred
500        assert!(!result.is_null(1));
501        assert!(!value_field.is_null(1)); // value should contain original
502        assert!(typed_value_field.is_null(1)); // typed_value should be null
503        assert_eq!(
504            Variant::new(metadata_field.value(1), value_field.value(1)),
505            Variant::from("hello")
506        );
507
508        // Row 2: 100 -> should shred successfully
509        assert!(!result.is_null(2));
510        assert!(value_field.is_null(2));
511        assert_eq!(typed_value_field.value(2), 100);
512
513        // Row 3: array null -> should be null in result
514        assert!(result.is_null(3));
515
516        // Row 4: Variant::Null -> should not shred (it's a null variant, not an integer)
517        assert!(!result.is_null(4));
518        assert!(!value_field.is_null(4)); // should contain Variant::Null
519        assert_eq!(
520            Variant::new(metadata_field.value(4), value_field.value(4)),
521            Variant::Null
522        );
523        assert!(typed_value_field.is_null(4));
524
525        // Row 5: 3i8 -> should shred successfully (int8->int64 conversion)
526        assert!(!result.is_null(5));
527        assert!(value_field.is_null(5)); // value should be null when shredded
528        assert!(!typed_value_field.is_null(5));
529        assert_eq!(typed_value_field.value(5), 3);
530    }
531
532    #[test]
533    fn test_primitive_different_target_types() {
534        let input = VariantArray::from_iter(vec![
535            Variant::from(42i32),
536            Variant::from(3.15f64),
537            Variant::from("not_a_number"),
538        ]);
539
540        // Test Int32 target
541        let result_int32 = shred_variant(&input, &DataType::Int32).unwrap();
542        let typed_value_int32 = result_int32
543            .typed_value_field()
544            .unwrap()
545            .as_any()
546            .downcast_ref::<arrow::array::Int32Array>()
547            .unwrap();
548        assert_eq!(typed_value_int32.value(0), 42);
549        assert!(typed_value_int32.is_null(1)); // float doesn't convert to int32
550        assert!(typed_value_int32.is_null(2)); // string doesn't convert to int32
551
552        // Test Float64 target
553        let result_float64 = shred_variant(&input, &DataType::Float64).unwrap();
554        let typed_value_float64 = result_float64
555            .typed_value_field()
556            .unwrap()
557            .as_any()
558            .downcast_ref::<Float64Array>()
559            .unwrap();
560        assert_eq!(typed_value_float64.value(0), 42.0); // int converts to float
561        assert_eq!(typed_value_float64.value(1), 3.15);
562        assert!(typed_value_float64.is_null(2)); // string doesn't convert
563    }
564
565    #[test]
566    fn test_invalid_shredded_types_rejected() {
567        let input = VariantArray::from_iter([Variant::from(42)]);
568
569        let invalid_types = vec![
570            DataType::UInt8,
571            DataType::Float16,
572            DataType::Decimal256(38, 10),
573            DataType::Date64,
574            DataType::Time32(TimeUnit::Second),
575            DataType::Time64(TimeUnit::Nanosecond),
576            DataType::Timestamp(TimeUnit::Millisecond, None),
577            DataType::LargeBinary,
578            DataType::LargeUtf8,
579            DataType::FixedSizeBinary(17),
580            DataType::Union(
581                UnionFields::new(
582                    vec![0_i8, 1_i8],
583                    vec![
584                        Field::new("int_field", DataType::Int32, false),
585                        Field::new("str_field", DataType::Utf8, true),
586                    ],
587                ),
588                UnionMode::Dense,
589            ),
590            DataType::Map(
591                Arc::new(Field::new(
592                    "entries",
593                    DataType::Struct(Fields::from(vec![
594                        Field::new("key", DataType::Utf8, false),
595                        Field::new("value", DataType::Int32, true),
596                    ])),
597                    false,
598                )),
599                false,
600            ),
601            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
602            DataType::RunEndEncoded(
603                Arc::new(Field::new("run_ends", DataType::Int32, false)),
604                Arc::new(Field::new("values", DataType::Utf8, true)),
605            ),
606        ];
607
608        for data_type in invalid_types {
609            let err = shred_variant(&input, &data_type).unwrap_err();
610            assert!(
611                matches!(err, ArrowError::InvalidArgumentError(_)),
612                "expected InvalidArgumentError for {:?}, got {:?}",
613                data_type,
614                err
615            );
616        }
617    }
618
619    #[test]
620    fn test_object_shredding_comprehensive() {
621        let mut builder = VariantArrayBuilder::new(7);
622
623        // Row 0: Fully shredded object
624        builder
625            .new_object()
626            .with_field("score", 95.5f64)
627            .with_field("age", 30i64)
628            .finish();
629
630        // Row 1: Partially shredded object (extra email field)
631        builder
632            .new_object()
633            .with_field("score", 87.2f64)
634            .with_field("age", 25i64)
635            .with_field("email", "bob@example.com")
636            .finish();
637
638        // Row 2: Missing field (no score)
639        builder.new_object().with_field("age", 35i64).finish();
640
641        // Row 3: Type mismatch (score is string, age is string)
642        builder
643            .new_object()
644            .with_field("score", "ninety-five")
645            .with_field("age", "thirty")
646            .finish();
647
648        // Row 4: Non-object
649        builder.append_variant(Variant::from("not an object"));
650
651        // Row 5: Empty object
652        builder.new_object().finish();
653
654        // Row 6: Null
655        builder.append_null();
656
657        // Row 7: Object with only "wrong" fields
658        builder.new_object().with_field("foo", 10).finish();
659
660        // Row 8: Object with one "right" and one "wrong" field
661        builder
662            .new_object()
663            .with_field("score", 66.67f64)
664            .with_field("foo", 10)
665            .finish();
666
667        let input = builder.build();
668
669        // Create target schema: struct<score: float64, age: int64>
670        // Both types are supported for shredding
671        let fields = Fields::from(vec![
672            Field::new("score", DataType::Float64, true),
673            Field::new("age", DataType::Int64, true),
674        ]);
675        let target_schema = DataType::Struct(fields);
676
677        let result = shred_variant(&input, &target_schema).unwrap();
678
679        // Verify structure
680        assert!(result.value_field().is_some());
681        assert!(result.typed_value_field().is_some());
682        assert_eq!(result.len(), 9);
683
684        let metadata = result.metadata_field();
685
686        let value = result.value_field().unwrap();
687        let typed_value = result
688            .typed_value_field()
689            .unwrap()
690            .as_any()
691            .downcast_ref::<arrow::array::StructArray>()
692            .unwrap();
693
694        // Extract score and age fields from typed_value struct
695        let score_field =
696            ShreddedVariantFieldArray::try_new(typed_value.column_by_name("score").unwrap())
697                .unwrap();
698        let age_field =
699            ShreddedVariantFieldArray::try_new(typed_value.column_by_name("age").unwrap()).unwrap();
700
701        let score_value = score_field
702            .value_field()
703            .unwrap()
704            .as_any()
705            .downcast_ref::<BinaryViewArray>()
706            .unwrap();
707        let score_typed_value = score_field
708            .typed_value_field()
709            .unwrap()
710            .as_any()
711            .downcast_ref::<Float64Array>()
712            .unwrap();
713        let age_value = age_field
714            .value_field()
715            .unwrap()
716            .as_any()
717            .downcast_ref::<BinaryViewArray>()
718            .unwrap();
719        let age_typed_value = age_field
720            .typed_value_field()
721            .unwrap()
722            .as_any()
723            .downcast_ref::<Int64Array>()
724            .unwrap();
725
726        // Set up exhaustive checking of all shredded columns and their nulls/values
727        struct ShreddedValue<'m, 'v, T> {
728            value: Option<Variant<'m, 'v>>,
729            typed_value: Option<T>,
730        }
731        struct ShreddedStruct<'m, 'v> {
732            score: ShreddedValue<'m, 'v, f64>,
733            age: ShreddedValue<'m, 'v, i64>,
734        }
735        fn get_value<'m, 'v>(
736            i: usize,
737            metadata: &'m BinaryViewArray,
738            value: &'v BinaryViewArray,
739        ) -> Variant<'m, 'v> {
740            Variant::new(metadata.value(i), value.value(i))
741        }
742        let expect = |i, expected_result: Option<ShreddedValue<ShreddedStruct>>| {
743            match expected_result {
744                Some(ShreddedValue {
745                    value: expected_value,
746                    typed_value: expected_typed_value,
747                }) => {
748                    assert!(result.is_valid(i));
749                    match expected_value {
750                        Some(expected_value) => {
751                            assert!(value.is_valid(i));
752                            assert_eq!(expected_value, get_value(i, metadata, value));
753                        }
754                        None => {
755                            assert!(value.is_null(i));
756                        }
757                    }
758                    match expected_typed_value {
759                        Some(ShreddedStruct {
760                            score: expected_score,
761                            age: expected_age,
762                        }) => {
763                            assert!(typed_value.is_valid(i));
764                            assert!(score_field.is_valid(i)); // non-nullable
765                            assert!(age_field.is_valid(i)); // non-nullable
766                            match expected_score.value {
767                                Some(expected_score_value) => {
768                                    assert!(score_value.is_valid(i));
769                                    assert_eq!(
770                                        expected_score_value,
771                                        get_value(i, metadata, score_value)
772                                    );
773                                }
774                                None => {
775                                    assert!(score_value.is_null(i));
776                                }
777                            }
778                            match expected_score.typed_value {
779                                Some(expected_score) => {
780                                    assert!(score_typed_value.is_valid(i));
781                                    assert_eq!(expected_score, score_typed_value.value(i));
782                                }
783                                None => {
784                                    assert!(score_typed_value.is_null(i));
785                                }
786                            }
787                            match expected_age.value {
788                                Some(expected_age_value) => {
789                                    assert!(age_value.is_valid(i));
790                                    assert_eq!(
791                                        expected_age_value,
792                                        get_value(i, metadata, age_value)
793                                    );
794                                }
795                                None => {
796                                    assert!(age_value.is_null(i));
797                                }
798                            }
799                            match expected_age.typed_value {
800                                Some(expected_age) => {
801                                    assert!(age_typed_value.is_valid(i));
802                                    assert_eq!(expected_age, age_typed_value.value(i));
803                                }
804                                None => {
805                                    assert!(age_typed_value.is_null(i));
806                                }
807                            }
808                        }
809                        None => {
810                            assert!(typed_value.is_null(i));
811                        }
812                    }
813                }
814                None => {
815                    assert!(result.is_null(i));
816                }
817            };
818        };
819
820        // Row 0: Fully shredded - both fields shred successfully
821        expect(
822            0,
823            Some(ShreddedValue {
824                value: None,
825                typed_value: Some(ShreddedStruct {
826                    score: ShreddedValue {
827                        value: None,
828                        typed_value: Some(95.5),
829                    },
830                    age: ShreddedValue {
831                        value: None,
832                        typed_value: Some(30),
833                    },
834                }),
835            }),
836        );
837
838        // Row 1: Partially shredded - value contains extra email field
839        let mut builder = VariantBuilder::new();
840        builder
841            .new_object()
842            .with_field("email", "bob@example.com")
843            .finish();
844        let (m, v) = builder.finish();
845        let expected_value = Variant::new(&m, &v);
846
847        expect(
848            1,
849            Some(ShreddedValue {
850                value: Some(expected_value),
851                typed_value: Some(ShreddedStruct {
852                    score: ShreddedValue {
853                        value: None,
854                        typed_value: Some(87.2),
855                    },
856                    age: ShreddedValue {
857                        value: None,
858                        typed_value: Some(25),
859                    },
860                }),
861            }),
862        );
863
864        // Row 2: Fully shredded -- missing score field
865        expect(
866            2,
867            Some(ShreddedValue {
868                value: None,
869                typed_value: Some(ShreddedStruct {
870                    score: ShreddedValue {
871                        value: None,
872                        typed_value: None,
873                    },
874                    age: ShreddedValue {
875                        value: None,
876                        typed_value: Some(35),
877                    },
878                }),
879            }),
880        );
881
882        // Row 3: Type mismatches - both score and age are strings
883        expect(
884            3,
885            Some(ShreddedValue {
886                value: None,
887                typed_value: Some(ShreddedStruct {
888                    score: ShreddedValue {
889                        value: Some(Variant::from("ninety-five")),
890                        typed_value: None,
891                    },
892                    age: ShreddedValue {
893                        value: Some(Variant::from("thirty")),
894                        typed_value: None,
895                    },
896                }),
897            }),
898        );
899
900        // Row 4: Non-object - falls back to value field
901        expect(
902            4,
903            Some(ShreddedValue {
904                value: Some(Variant::from("not an object")),
905                typed_value: None,
906            }),
907        );
908
909        // Row 5: Empty object
910        expect(
911            5,
912            Some(ShreddedValue {
913                value: None,
914                typed_value: Some(ShreddedStruct {
915                    score: ShreddedValue {
916                        value: None,
917                        typed_value: None,
918                    },
919                    age: ShreddedValue {
920                        value: None,
921                        typed_value: None,
922                    },
923                }),
924            }),
925        );
926
927        // Row 6: Null
928        expect(6, None);
929
930        // Helper to correctly create a variant object using a row's existing metadata
931        let object_with_foo_field = |i| {
932            use parquet_variant::{ParentState, ValueBuilder, VariantMetadata};
933            let metadata = VariantMetadata::new(metadata.value(i));
934            let mut metadata_builder = ReadOnlyMetadataBuilder::new(&metadata);
935            let mut value_builder = ValueBuilder::new();
936            let state = ParentState::variant(&mut value_builder, &mut metadata_builder);
937            ObjectBuilder::new(state, false)
938                .with_field("foo", 10)
939                .finish();
940            (metadata, value_builder.into_inner())
941        };
942
943        // Row 7: Object with only a "wrong" field
944        let (m, v) = object_with_foo_field(7);
945        expect(
946            7,
947            Some(ShreddedValue {
948                value: Some(Variant::new_with_metadata(m, &v)),
949                typed_value: Some(ShreddedStruct {
950                    score: ShreddedValue {
951                        value: None,
952                        typed_value: None,
953                    },
954                    age: ShreddedValue {
955                        value: None,
956                        typed_value: None,
957                    },
958                }),
959            }),
960        );
961
962        // Row 8: Object with one "wrong" and one "right" field
963        let (m, v) = object_with_foo_field(8);
964        expect(
965            8,
966            Some(ShreddedValue {
967                value: Some(Variant::new_with_metadata(m, &v)),
968                typed_value: Some(ShreddedStruct {
969                    score: ShreddedValue {
970                        value: None,
971                        typed_value: Some(66.67),
972                    },
973                    age: ShreddedValue {
974                        value: None,
975                        typed_value: None,
976                    },
977                }),
978            }),
979        );
980    }
981
982    #[test]
983    fn test_object_different_schemas() {
984        // Create object with multiple fields
985        let mut builder = VariantArrayBuilder::new(1);
986        builder
987            .new_object()
988            .with_field("id", 123i32)
989            .with_field("age", 25i64)
990            .with_field("score", 95.5f64)
991            .finish();
992        let input = builder.build();
993
994        // Test with schema containing only id field
995        let schema1 = DataType::Struct(Fields::from(vec![Field::new("id", DataType::Int32, true)]));
996        let result1 = shred_variant(&input, &schema1).unwrap();
997        let value_field1 = result1.value_field().unwrap();
998        assert!(!value_field1.is_null(0)); // should contain {"age": 25, "score": 95.5}
999
1000        // Test with schema containing id and age fields
1001        let schema2 = DataType::Struct(Fields::from(vec![
1002            Field::new("id", DataType::Int32, true),
1003            Field::new("age", DataType::Int64, true),
1004        ]));
1005        let result2 = shred_variant(&input, &schema2).unwrap();
1006        let value_field2 = result2.value_field().unwrap();
1007        assert!(!value_field2.is_null(0)); // should contain {"score": 95.5}
1008
1009        // Test with schema containing all fields
1010        let schema3 = DataType::Struct(Fields::from(vec![
1011            Field::new("id", DataType::Int32, true),
1012            Field::new("age", DataType::Int64, true),
1013            Field::new("score", DataType::Float64, true),
1014        ]));
1015        let result3 = shred_variant(&input, &schema3).unwrap();
1016        let value_field3 = result3.value_field().unwrap();
1017        assert!(value_field3.is_null(0)); // fully shredded, no remaining fields
1018    }
1019
1020    #[test]
1021    fn test_uuid_shredding_in_objects() {
1022        let mock_uuid_1 = Uuid::new_v4();
1023        let mock_uuid_2 = Uuid::new_v4();
1024        let mock_uuid_3 = Uuid::new_v4();
1025
1026        let mut builder = VariantArrayBuilder::new(6);
1027
1028        // Row 0: Fully shredded object with both UUID fields
1029        builder
1030            .new_object()
1031            .with_field("id", mock_uuid_1)
1032            .with_field("session_id", mock_uuid_2)
1033            .finish();
1034
1035        // Row 1: Partially shredded object - UUID fields plus extra field
1036        builder
1037            .new_object()
1038            .with_field("id", mock_uuid_2)
1039            .with_field("session_id", mock_uuid_3)
1040            .with_field("name", "test_user")
1041            .finish();
1042
1043        // Row 2: Missing UUID field (no session_id)
1044        builder.new_object().with_field("id", mock_uuid_1).finish();
1045
1046        // Row 3: Type mismatch - id is UUID but session_id is a string
1047        builder
1048            .new_object()
1049            .with_field("id", mock_uuid_3)
1050            .with_field("session_id", "not-a-uuid")
1051            .finish();
1052
1053        // Row 4: Object with non-UUID value in id field
1054        builder
1055            .new_object()
1056            .with_field("id", 12345i64)
1057            .with_field("session_id", mock_uuid_1)
1058            .finish();
1059
1060        // Row 5: Null
1061        builder.append_null();
1062
1063        let input = builder.build();
1064
1065        let fields = Fields::from(vec![
1066            Field::new("id", DataType::FixedSizeBinary(16), true),
1067            Field::new("session_id", DataType::FixedSizeBinary(16), true),
1068        ]);
1069        let target_schema = DataType::Struct(fields);
1070
1071        let result = shred_variant(&input, &target_schema).unwrap();
1072
1073        assert!(result.value_field().is_some());
1074        assert!(result.typed_value_field().is_some());
1075        assert_eq!(result.len(), 6);
1076
1077        let metadata = result.metadata_field();
1078        let value = result.value_field().unwrap();
1079        let typed_value = result
1080            .typed_value_field()
1081            .unwrap()
1082            .as_any()
1083            .downcast_ref::<arrow::array::StructArray>()
1084            .unwrap();
1085
1086        // Extract id and session_id fields from typed_value struct
1087        let id_field =
1088            ShreddedVariantFieldArray::try_new(typed_value.column_by_name("id").unwrap()).unwrap();
1089        let session_id_field =
1090            ShreddedVariantFieldArray::try_new(typed_value.column_by_name("session_id").unwrap())
1091                .unwrap();
1092
1093        let id_value = id_field
1094            .value_field()
1095            .unwrap()
1096            .as_any()
1097            .downcast_ref::<BinaryViewArray>()
1098            .unwrap();
1099        let id_typed_value = id_field
1100            .typed_value_field()
1101            .unwrap()
1102            .as_any()
1103            .downcast_ref::<FixedSizeBinaryArray>()
1104            .unwrap();
1105        let session_id_value = session_id_field
1106            .value_field()
1107            .unwrap()
1108            .as_any()
1109            .downcast_ref::<BinaryViewArray>()
1110            .unwrap();
1111        let session_id_typed_value = session_id_field
1112            .typed_value_field()
1113            .unwrap()
1114            .as_any()
1115            .downcast_ref::<FixedSizeBinaryArray>()
1116            .unwrap();
1117
1118        // Row 0: Fully shredded - both UUID fields shred successfully
1119        assert!(result.is_valid(0));
1120
1121        assert!(value.is_null(0)); // fully shredded, no remaining fields
1122        assert!(id_value.is_null(0));
1123        assert!(session_id_value.is_null(0));
1124
1125        assert!(typed_value.is_valid(0));
1126        assert!(id_typed_value.is_valid(0));
1127        assert!(session_id_typed_value.is_valid(0));
1128
1129        assert_eq!(id_typed_value.value(0), mock_uuid_1.as_bytes());
1130        assert_eq!(session_id_typed_value.value(0), mock_uuid_2.as_bytes());
1131
1132        // Row 1: Partially shredded - value contains extra name field
1133        assert!(result.is_valid(1));
1134
1135        assert!(value.is_valid(1)); // contains unshredded "name" field
1136        assert!(typed_value.is_valid(1));
1137
1138        assert!(id_value.is_null(1));
1139        assert!(id_typed_value.is_valid(1));
1140        assert_eq!(id_typed_value.value(1), mock_uuid_2.as_bytes());
1141
1142        assert!(session_id_value.is_null(1));
1143        assert!(session_id_typed_value.is_valid(1));
1144        assert_eq!(session_id_typed_value.value(1), mock_uuid_3.as_bytes());
1145
1146        // Verify the value field contains the name field
1147        let row_1_variant = Variant::new(metadata.value(1), value.value(1));
1148        let Variant::Object(obj) = row_1_variant else {
1149            panic!("Expected object");
1150        };
1151
1152        assert_eq!(obj.get("name"), Some(Variant::from("test_user")));
1153
1154        // Row 2: Missing session_id field
1155        assert!(result.is_valid(2));
1156
1157        assert!(value.is_null(2)); // fully shredded, no extra fields
1158        assert!(typed_value.is_valid(2));
1159
1160        assert!(id_value.is_null(2));
1161        assert!(id_typed_value.is_valid(2));
1162        assert_eq!(id_typed_value.value(2), mock_uuid_1.as_bytes());
1163
1164        assert!(session_id_value.is_null(2));
1165        assert!(session_id_typed_value.is_null(2)); // missing field
1166
1167        // Row 3: Type mismatch - session_id is a string, not UUID
1168        assert!(result.is_valid(3));
1169
1170        assert!(value.is_null(3)); // no extra fields
1171        assert!(typed_value.is_valid(3));
1172
1173        assert!(id_value.is_null(3));
1174        assert!(id_typed_value.is_valid(3));
1175        assert_eq!(id_typed_value.value(3), mock_uuid_3.as_bytes());
1176
1177        assert!(session_id_value.is_valid(3)); // type mismatch, stored in value
1178        assert!(session_id_typed_value.is_null(3));
1179        let session_id_variant = Variant::new(metadata.value(3), session_id_value.value(3));
1180        assert_eq!(session_id_variant, Variant::from("not-a-uuid"));
1181
1182        // Row 4: Type mismatch - id is int64, not UUID
1183        assert!(result.is_valid(4));
1184
1185        assert!(value.is_null(4)); // no extra fields
1186        assert!(typed_value.is_valid(4));
1187
1188        assert!(id_value.is_valid(4)); // type mismatch, stored in value
1189        assert!(id_typed_value.is_null(4));
1190        let id_variant = Variant::new(metadata.value(4), id_value.value(4));
1191        assert_eq!(id_variant, Variant::from(12345i64));
1192
1193        assert!(session_id_value.is_null(4));
1194        assert!(session_id_typed_value.is_valid(4));
1195        assert_eq!(session_id_typed_value.value(4), mock_uuid_1.as_bytes());
1196
1197        // Row 5: Null
1198        assert!(result.is_null(5));
1199    }
1200
1201    #[test]
1202    fn test_spec_compliance() {
1203        let input = VariantArray::from_iter(vec![Variant::from(42i64), Variant::from("hello")]);
1204
1205        let result = shred_variant(&input, &DataType::Int64).unwrap();
1206
1207        // Test field access by name (not position)
1208        let inner_struct = result.inner();
1209        assert!(inner_struct.column_by_name("metadata").is_some());
1210        assert!(inner_struct.column_by_name("value").is_some());
1211        assert!(inner_struct.column_by_name("typed_value").is_some());
1212
1213        // Test metadata preservation
1214        assert_eq!(result.metadata_field().len(), input.metadata_field().len());
1215        // The metadata should be the same reference (cheap clone)
1216        // Note: BinaryViewArray doesn't have a .values() method, so we compare the arrays directly
1217        assert_eq!(result.metadata_field().len(), input.metadata_field().len());
1218
1219        // Test output structure correctness
1220        assert_eq!(result.len(), input.len());
1221        assert!(result.value_field().is_some());
1222        assert!(result.typed_value_field().is_some());
1223
1224        // For primitive shredding, verify that value and typed_value are never both non-null
1225        // (This rule applies to primitives; for objects, both can be non-null for partial shredding)
1226        let value_field = result.value_field().unwrap();
1227        let typed_value_field = result
1228            .typed_value_field()
1229            .unwrap()
1230            .as_any()
1231            .downcast_ref::<Int64Array>()
1232            .unwrap();
1233
1234        for i in 0..result.len() {
1235            if !result.is_null(i) {
1236                let value_is_null = value_field.is_null(i);
1237                let typed_value_is_null = typed_value_field.is_null(i);
1238                // For primitive shredding, at least one should be null
1239                assert!(
1240                    value_is_null || typed_value_is_null,
1241                    "Row {}: both value and typed_value are non-null for primitive shredding",
1242                    i
1243                );
1244            }
1245        }
1246    }
1247}