1use crate::VariantArrayBuilder;
21use crate::type_conversion::{
22 generic_conversion_single_value, generic_conversion_single_value_with_result,
23 primitive_conversion_single_value,
24};
25use arrow::array::{Array, ArrayRef, AsArray, StructArray};
26use arrow::buffer::NullBuffer;
27use arrow::compute::cast;
28use arrow::datatypes::{
29 Date32Type, Decimal32Type, Decimal64Type, Decimal128Type, Float16Type, Float32Type,
30 Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, Time64MicrosecondType,
31 TimestampMicrosecondType, TimestampNanosecondType,
32};
33use arrow::error::Result;
34use arrow_schema::extension::{ExtensionType, Uuid as UuidExtension};
35use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
36use chrono::{DateTime, NaiveTime};
37use parquet_variant::{
38 Uuid, Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16, VariantDecimalType as _,
39};
40
41use std::borrow::Cow;
42use std::sync::Arc;
43
44pub(crate) fn binary_array_value(array: &dyn Array, index: usize) -> Option<&[u8]> {
46 match array.data_type() {
47 DataType::Binary => Some(array.as_binary::<i32>().value(index)),
48 DataType::LargeBinary => Some(array.as_binary::<i64>().value(index)),
49 DataType::BinaryView => Some(array.as_binary_view().value(index)),
50 _ => None,
51 }
52}
53
54pub(crate) fn variant_from_arrays_at<'m, 'v>(
57 metadata: &'m dyn Array,
58 value: &'v dyn Array,
59 index: usize,
60) -> Option<Variant<'m, 'v>> {
61 let metadata = binary_array_value(metadata, index)?;
62 let value = binary_array_value(value, index)?;
63 Some(Variant::new(metadata, value))
64}
65
66pub(crate) fn validate_binary_array(array: &dyn Array, field_name: &str) -> Result<()> {
68 match array.data_type() {
69 DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Ok(()),
70 _ => Err(ArrowError::InvalidArgumentError(format!(
71 "VariantArray '{field_name}' field must be Binary, LargeBinary, or BinaryView, got {}",
72 array.data_type()
73 ))),
74 }
75}
76
77pub struct VariantType;
82
83impl ExtensionType for VariantType {
84 const NAME: &'static str = "arrow.parquet.variant";
85
86 type Metadata = &'static str;
89
90 fn metadata(&self) -> &Self::Metadata {
91 &""
92 }
93
94 fn serialize_metadata(&self) -> Option<String> {
95 Some(String::new())
96 }
97
98 fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata> {
99 Ok("")
100 }
101
102 fn supports_data_type(&self, data_type: &DataType) -> Result<()> {
103 if matches!(data_type, DataType::Struct(_)) {
104 Ok(())
105 } else {
106 Err(ArrowError::InvalidArgumentError(format!(
107 "VariantType only supports StructArray, got {data_type}"
108 )))
109 }
110 }
111
112 fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self> {
113 Self.supports_data_type(data_type)?;
114 Ok(Self)
115 }
116
117 fn validate(data_type: &DataType, _metadata: Self::Metadata) -> Result<()> {
118 Self.supports_data_type(data_type)
119 }
120}
121
122#[derive(Debug, Clone)]
253pub struct VariantArray {
254 inner: StructArray,
256
257 metadata: ArrayRef,
259
260 shredding_state: ShreddingState,
262}
263
264impl VariantArray {
265 pub fn try_new(inner: &dyn Array) -> Result<Self> {
292 let inner = canonicalize_shredded_types(inner)?;
294
295 let Some(inner) = inner.as_struct_opt() else {
296 return Err(ArrowError::InvalidArgumentError(
297 "Invalid VariantArray: requires StructArray as input".to_string(),
298 ));
299 };
300
301 let Some(metadata_col) = inner.column_by_name("metadata") else {
305 return Err(ArrowError::InvalidArgumentError(
306 "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
307 ));
308 };
309 validate_binary_array(metadata_col.as_ref(), "metadata")?;
310
311 Ok(Self {
313 inner: inner.clone(),
314 metadata: metadata_col.clone(),
315 shredding_state: ShreddingState::try_from(inner)?,
316 })
317 }
318
319 pub(crate) fn from_parts(
320 metadata: ArrayRef,
321 value: Option<ArrayRef>,
322 typed_value: Option<ArrayRef>,
323 nulls: Option<NullBuffer>,
324 ) -> Self {
325 let mut builder = StructArrayBuilder::new().with_field("metadata", metadata.clone(), false);
326 if let Some(value) = value.clone() {
327 builder = builder.with_field("value", value, true);
328 }
329 if let Some(typed_value) = typed_value.clone() {
330 builder = builder.with_field_ref(typed_value_field(&typed_value), typed_value);
331 }
332 if let Some(nulls) = nulls {
333 builder = builder.with_nulls(nulls);
334 }
335
336 Self {
337 inner: builder.build(),
338 metadata,
339 shredding_state: ShreddingState::new(value, typed_value),
340 }
341 }
342
343 pub fn inner(&self) -> &StructArray {
345 &self.inner
346 }
347
348 pub fn into_inner(self) -> StructArray {
350 self.inner
351 }
352
353 pub fn shredding_state(&self) -> &ShreddingState {
355 &self.shredding_state
356 }
357
358 pub fn value(&self, index: usize) -> Variant<'_, '_> {
368 self.try_value(index).unwrap()
369 }
370
371 pub fn try_value(&self, index: usize) -> Result<Variant<'_, '_>> {
399 match (self.typed_value_field(), self.value_field()) {
400 (Some(typed_value), value) if typed_value.is_valid(index) => {
402 typed_value_to_variant(typed_value, value, index)
403 }
404 (_, Some(value)) if value.is_valid(index) => variant_from_arrays_at(
406 &self.metadata,
407 value,
408 index,
409 )
410 .ok_or_else(|| {
411 ArrowError::InvalidArgumentError(format!(
412 "metadata and value fields must be binary-like arrays, instead got {} and {}",
413 self.metadata.data_type(),
414 value.data_type()
415 ))
416 }),
417 _ => Ok(Variant::Null),
420 }
421 }
422
423 pub fn metadata_field(&self) -> &ArrayRef {
425 &self.metadata
426 }
427
428 pub fn value_field(&self) -> Option<&ArrayRef> {
430 self.shredding_state.value_field()
431 }
432
433 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
435 self.shredding_state.typed_value_field()
436 }
437
438 pub fn field(&self, name: impl Into<String>) -> Field {
441 Field::new(
442 name.into(),
443 self.data_type().clone(),
444 self.inner.is_nullable(),
445 )
446 .with_extension_type(VariantType)
447 }
448
449 pub fn data_type(&self) -> &DataType {
451 self.inner.data_type()
452 }
453
454 pub fn slice(&self, offset: usize, length: usize) -> Self {
455 let inner = self.inner.slice(offset, length);
456 let metadata = self.metadata.slice(offset, length);
457 let shredding_state = self.shredding_state.slice(offset, length);
458 Self {
459 inner,
460 metadata,
461 shredding_state,
462 }
463 }
464
465 pub fn len(&self) -> usize {
466 self.inner.len()
467 }
468
469 pub fn is_empty(&self) -> bool {
470 self.inner.is_empty()
471 }
472
473 pub fn nulls(&self) -> Option<&NullBuffer> {
474 self.inner.nulls()
475 }
476
477 pub fn is_null(&self, index: usize) -> bool {
479 self.nulls().is_some_and(|n| n.is_null(index))
480 }
481
482 pub fn is_valid(&self, index: usize) -> bool {
484 !self.is_null(index)
485 }
486
487 pub fn iter(&self) -> VariantArrayIter<'_> {
489 VariantArrayIter::new(self)
490 }
491}
492
493impl PartialEq for VariantArray {
494 fn eq(&self, other: &Self) -> bool {
495 self.inner == other.inner
496 }
497}
498
499impl From<VariantArray> for StructArray {
500 fn from(variant_array: VariantArray) -> Self {
501 variant_array.into_inner()
502 }
503}
504
505impl From<VariantArray> for ArrayRef {
506 fn from(variant_array: VariantArray) -> Self {
507 Arc::new(variant_array.into_inner())
508 }
509}
510
511impl<'m, 'v> FromIterator<Option<Variant<'m, 'v>>> for VariantArray {
512 fn from_iter<T: IntoIterator<Item = Option<Variant<'m, 'v>>>>(iter: T) -> Self {
513 let iter = iter.into_iter();
514
515 let mut b = VariantArrayBuilder::new(iter.size_hint().0);
516 b.extend(iter);
517 b.build()
518 }
519}
520
521impl<'m, 'v> FromIterator<Variant<'m, 'v>> for VariantArray {
522 fn from_iter<T: IntoIterator<Item = Variant<'m, 'v>>>(iter: T) -> Self {
523 Self::from_iter(iter.into_iter().map(Some))
524 }
525}
526
527#[derive(Debug)]
552pub struct VariantArrayIter<'a> {
553 array: &'a VariantArray,
554 head_i: usize,
555 tail_i: usize,
556}
557
558impl<'a> VariantArrayIter<'a> {
559 pub fn new(array: &'a VariantArray) -> Self {
561 Self {
562 array,
563 head_i: 0,
564 tail_i: array.len(),
565 }
566 }
567
568 fn value_opt(&self, i: usize) -> Option<Variant<'a, 'a>> {
569 self.array.is_valid(i).then(|| self.array.value(i))
570 }
571}
572
573impl<'a> Iterator for VariantArrayIter<'a> {
574 type Item = Option<Variant<'a, 'a>>;
575
576 #[inline]
577 fn next(&mut self) -> Option<Self::Item> {
578 if self.head_i == self.tail_i {
579 return None;
580 }
581
582 let out = self.value_opt(self.head_i);
583
584 self.head_i += 1;
585
586 Some(out)
587 }
588
589 fn size_hint(&self) -> (usize, Option<usize>) {
590 let remainder = self.tail_i - self.head_i;
591
592 (remainder, Some(remainder))
593 }
594}
595
596impl<'a> DoubleEndedIterator for VariantArrayIter<'a> {
597 fn next_back(&mut self) -> Option<Self::Item> {
598 if self.head_i == self.tail_i {
599 return None;
600 }
601
602 self.tail_i -= 1;
603
604 Some(self.value_opt(self.tail_i))
605 }
606}
607
608impl<'a> ExactSizeIterator for VariantArrayIter<'a> {}
609
610#[derive(Debug)]
645pub struct ShreddedVariantFieldArray {
646 inner: StructArray,
648 shredding_state: ShreddingState,
649}
650
651#[allow(unused)]
652impl ShreddedVariantFieldArray {
653 pub fn try_new(inner: &dyn Array) -> Result<Self> {
673 let Some(inner_struct) = inner.as_struct_opt() else {
674 return Err(ArrowError::InvalidArgumentError(
675 "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(),
676 ));
677 };
678
679 Ok(Self {
681 inner: inner_struct.clone(),
682 shredding_state: ShreddingState::try_from(inner_struct)?,
683 })
684 }
685
686 pub fn shredding_state(&self) -> &ShreddingState {
688 &self.shredding_state
689 }
690
691 pub fn value_field(&self) -> Option<&ArrayRef> {
693 self.shredding_state.value_field()
694 }
695
696 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
698 self.shredding_state.typed_value_field()
699 }
700
701 pub fn inner(&self) -> &StructArray {
703 &self.inner
704 }
705
706 pub(crate) fn from_parts(
707 value: Option<ArrayRef>,
708 typed_value: Option<ArrayRef>,
709 nulls: Option<NullBuffer>,
710 ) -> Self {
711 let mut builder = StructArrayBuilder::new();
712 if let Some(value) = value.clone() {
713 builder = builder.with_field("value", value, true);
714 }
715 if let Some(typed_value) = typed_value.clone() {
716 builder = builder.with_field_ref(typed_value_field(&typed_value), typed_value);
717 }
718 if let Some(nulls) = nulls {
719 builder = builder.with_nulls(nulls);
720 }
721
722 Self {
723 inner: builder.build(),
724 shredding_state: ShreddingState::new(value, typed_value),
725 }
726 }
727
728 pub fn into_inner(self) -> StructArray {
730 self.inner
731 }
732
733 pub fn data_type(&self) -> &DataType {
734 self.inner.data_type()
735 }
736
737 pub fn len(&self) -> usize {
738 self.inner.len()
739 }
740
741 pub fn is_empty(&self) -> bool {
742 self.inner.is_empty()
743 }
744
745 pub fn offset(&self) -> usize {
746 self.inner.offset()
747 }
748
749 pub fn nulls(&self) -> Option<&NullBuffer> {
750 None
754 }
755 pub fn is_null(&self, index: usize) -> bool {
757 self.nulls().is_some_and(|n| n.is_null(index))
758 }
759
760 pub fn is_valid(&self, index: usize) -> bool {
762 !self.is_null(index)
763 }
764}
765
766impl From<ShreddedVariantFieldArray> for ArrayRef {
767 fn from(array: ShreddedVariantFieldArray) -> Self {
768 Arc::new(array.into_inner())
769 }
770}
771
772impl From<ShreddedVariantFieldArray> for StructArray {
773 fn from(array: ShreddedVariantFieldArray) -> Self {
774 array.into_inner()
775 }
776}
777
778#[derive(Debug, Clone)]
812pub struct ShreddingState {
813 value: Option<ArrayRef>,
814 typed_value: Option<ArrayRef>,
815}
816
817impl ShreddingState {
818 pub fn new(value: Option<ArrayRef>, typed_value: Option<ArrayRef>) -> Self {
833 Self { value, typed_value }
834 }
835
836 pub fn value_field(&self) -> Option<&ArrayRef> {
838 self.value.as_ref()
839 }
840
841 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
843 self.typed_value.as_ref()
844 }
845
846 pub fn slice(&self, offset: usize, length: usize) -> Self {
848 Self {
849 value: self.value.as_ref().map(|v| v.slice(offset, length)),
850 typed_value: self.typed_value.as_ref().map(|tv| tv.slice(offset, length)),
851 }
852 }
853}
854
855impl TryFrom<&StructArray> for ShreddingState {
856 type Error = ArrowError;
857
858 fn try_from(inner_struct: &StructArray) -> Result<Self> {
859 let value = if let Some(value_col) = inner_struct.column_by_name("value") {
861 validate_binary_array(value_col.as_ref(), "value")?;
862 Some(value_col.clone())
863 } else {
864 None
865 };
866 let typed_value = inner_struct.column_by_name("typed_value").cloned();
867 Ok(ShreddingState::new(value, typed_value))
868 }
869}
870
871fn typed_value_field(array: &ArrayRef) -> FieldRef {
877 let mut field = Field::new("typed_value", array.data_type().clone(), true);
878 if matches!(array.data_type(), DataType::FixedSizeBinary(16)) {
879 field = field.with_extension_type(UuidExtension);
880 }
881 Arc::new(field)
882}
883
884#[derive(Debug, Default, Clone)]
888pub(crate) struct StructArrayBuilder {
889 fields: Vec<FieldRef>,
890 arrays: Vec<ArrayRef>,
891 nulls: Option<NullBuffer>,
892}
893
894impl StructArrayBuilder {
895 pub fn new() -> Self {
896 Default::default()
897 }
898
899 pub fn with_field(mut self, field_name: &str, array: ArrayRef, nullable: bool) -> Self {
901 let field = Field::new(field_name, array.data_type().clone(), nullable);
902 self.fields.push(Arc::new(field));
903 self.arrays.push(array);
904 self
905 }
906
907 pub fn with_field_ref(mut self, field: FieldRef, array: ArrayRef) -> Self {
912 self.fields.push(field);
913 self.arrays.push(array);
914 self
915 }
916
917 pub fn with_nulls(mut self, nulls: NullBuffer) -> Self {
919 self.nulls = Some(nulls);
920 self
921 }
922
923 pub fn build(self) -> StructArray {
924 let Self {
925 fields,
926 arrays,
927 nulls,
928 } = self;
929 StructArray::new(Fields::from(fields), arrays, nulls)
930 }
931}
932
933fn typed_value_to_variant<'a>(
935 typed_value: &'a ArrayRef,
936 value: Option<&'a ArrayRef>,
937 index: usize,
938) -> Result<Variant<'a, 'a>> {
939 let data_type = typed_value.data_type();
940 if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) {
941 panic!("Invalid variant, conflicting value and typed_value");
943 }
944 match data_type {
945 DataType::Null => Ok(Variant::Null),
946 DataType::Boolean => {
947 let boolean_array = typed_value.as_boolean();
948 let value = boolean_array.value(index);
949 Ok(Variant::from(value))
950 }
951 DataType::FixedSizeBinary(16) => {
953 let array = typed_value.as_fixed_size_binary();
954 let value = array.value(index);
955 Ok(Uuid::from_slice(value).unwrap().into()) }
957 DataType::Binary => {
958 let array = typed_value.as_binary::<i32>();
959 let value = array.value(index);
960 Ok(Variant::from(value))
961 }
962 DataType::LargeBinary => {
963 let array = typed_value.as_binary::<i64>();
964 let value = array.value(index);
965 Ok(Variant::from(value))
966 }
967 DataType::BinaryView => {
968 let array = typed_value.as_binary_view();
969 let value = array.value(index);
970 Ok(Variant::from(value))
971 }
972 DataType::Utf8 => {
973 let array = typed_value.as_string::<i32>();
974 let value = array.value(index);
975 Ok(Variant::from(value))
976 }
977 DataType::LargeUtf8 => {
978 let array = typed_value.as_string::<i64>();
979 let value = array.value(index);
980 Ok(Variant::from(value))
981 }
982 DataType::Utf8View => {
983 let array = typed_value.as_string_view();
984 let value = array.value(index);
985 Ok(Variant::from(value))
986 }
987 DataType::Int8 => {
988 primitive_conversion_single_value!(Int8Type, typed_value, index)
989 }
990 DataType::Int16 => {
991 primitive_conversion_single_value!(Int16Type, typed_value, index)
992 }
993 DataType::Int32 => {
994 primitive_conversion_single_value!(Int32Type, typed_value, index)
995 }
996 DataType::Int64 => {
997 primitive_conversion_single_value!(Int64Type, typed_value, index)
998 }
999 DataType::Float16 => {
1000 primitive_conversion_single_value!(Float16Type, typed_value, index)
1001 }
1002 DataType::Float32 => {
1003 primitive_conversion_single_value!(Float32Type, typed_value, index)
1004 }
1005 DataType::Float64 => {
1006 primitive_conversion_single_value!(Float64Type, typed_value, index)
1007 }
1008 DataType::Decimal32(_, s) => {
1009 generic_conversion_single_value_with_result!(
1010 Decimal32Type,
1011 as_primitive,
1012 |v| VariantDecimal4::try_new(v, *s as u8),
1013 typed_value,
1014 index
1015 )
1016 }
1017 DataType::Decimal64(_, s) => {
1018 generic_conversion_single_value_with_result!(
1019 Decimal64Type,
1020 as_primitive,
1021 |v| VariantDecimal8::try_new(v, *s as u8),
1022 typed_value,
1023 index
1024 )
1025 }
1026 DataType::Decimal128(_, s) => {
1027 generic_conversion_single_value_with_result!(
1028 Decimal128Type,
1029 as_primitive,
1030 |v| VariantDecimal16::try_new(v, *s as u8),
1031 typed_value,
1032 index
1033 )
1034 }
1035 DataType::Date32 => {
1036 generic_conversion_single_value!(
1037 Date32Type,
1038 as_primitive,
1039 |v| Date32Type::to_naive_date_opt(v).unwrap(),
1040 typed_value,
1041 index
1042 )
1043 }
1044 DataType::Time64(TimeUnit::Microsecond) => {
1045 generic_conversion_single_value_with_result!(
1046 Time64MicrosecondType,
1047 as_primitive,
1048 |v| NaiveTime::from_num_seconds_from_midnight_opt(
1049 (v / 1_000_000) as u32,
1050 (v % 1_000_000) as u32 * 1000
1051 )
1052 .ok_or_else(|| format!("Invalid microsecond from midnight: {}", v)),
1053 typed_value,
1054 index
1055 )
1056 }
1057 DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => {
1058 generic_conversion_single_value!(
1059 TimestampMicrosecondType,
1060 as_primitive,
1061 |v| DateTime::from_timestamp_micros(v).unwrap(),
1062 typed_value,
1063 index
1064 )
1065 }
1066 DataType::Timestamp(TimeUnit::Microsecond, None) => {
1067 generic_conversion_single_value!(
1068 TimestampMicrosecondType,
1069 as_primitive,
1070 |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(),
1071 typed_value,
1072 index
1073 )
1074 }
1075 DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => {
1076 generic_conversion_single_value!(
1077 TimestampNanosecondType,
1078 as_primitive,
1079 DateTime::from_timestamp_nanos,
1080 typed_value,
1081 index
1082 )
1083 }
1084 DataType::Timestamp(TimeUnit::Nanosecond, None) => {
1085 generic_conversion_single_value!(
1086 TimestampNanosecondType,
1087 as_primitive,
1088 |v| DateTime::from_timestamp_nanos(v).naive_utc(),
1089 typed_value,
1090 index
1091 )
1092 }
1093 _ => {
1096 debug_assert!(
1100 false,
1101 "Unsupported typed_value type: {}",
1102 typed_value.data_type()
1103 );
1104 Ok(Variant::Null)
1105 }
1106 }
1107}
1108
1109fn canonicalize_shredded_types(array: &dyn Array) -> Result<ArrayRef> {
1112 let new_type = canonicalize_and_verify_data_type(array.data_type())?;
1113 if let Cow::Borrowed(_) = new_type {
1114 if let Some(array) = array.as_struct_opt() {
1115 return Ok(Arc::new(array.clone())); }
1117 }
1118 cast(array, new_type.as_ref())
1119}
1120
1121fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_, DataType>> {
1125 use DataType::*;
1126
1127 macro_rules! fail {
1129 () => {
1130 return Err(ArrowError::InvalidArgumentError(format!(
1131 "Illegal shredded value type: {data_type}"
1132 )))
1133 };
1134 }
1135 macro_rules! borrow {
1136 () => {
1137 Cow::Borrowed(data_type)
1138 };
1139 }
1140
1141 let new_data_type = match data_type {
1142 Null | Boolean => borrow!(),
1144 Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(),
1145
1146 UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(),
1148
1149 Decimal64(p, s) | Decimal128(p, s)
1154 if VariantDecimal4::is_valid_precision_and_scale(p, s) =>
1155 {
1156 Cow::Owned(Decimal32(*p, *s))
1157 }
1158 Decimal128(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => {
1159 Cow::Owned(Decimal64(*p, *s))
1160 }
1161 Decimal32(p, s) if VariantDecimal4::is_valid_precision_and_scale(p, s) => borrow!(),
1162 Decimal64(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => borrow!(),
1163 Decimal128(p, s) if VariantDecimal16::is_valid_precision_and_scale(p, s) => borrow!(),
1164 Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(),
1165
1166 Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(),
1168 Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(),
1169
1170 Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
1172 Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
1173
1174 Binary | LargeBinary | BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),
1176
1177 FixedSizeBinary(16) => borrow!(),
1179 FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
1180
1181 List(field) => match canonicalize_and_verify_field(field)? {
1183 Cow::Borrowed(_) => borrow!(),
1184 Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)),
1185 },
1186 LargeList(field) => match canonicalize_and_verify_field(field)? {
1187 Cow::Borrowed(_) => borrow!(),
1188 Cow::Owned(new_field) => Cow::Owned(DataType::LargeList(new_field)),
1189 },
1190 ListView(field) => match canonicalize_and_verify_field(field)? {
1191 Cow::Borrowed(_) => borrow!(),
1192 Cow::Owned(new_field) => Cow::Owned(DataType::ListView(new_field)),
1193 },
1194 LargeListView(field) => match canonicalize_and_verify_field(field)? {
1195 Cow::Borrowed(_) => borrow!(),
1196 Cow::Owned(new_field) => Cow::Owned(DataType::LargeListView(new_field)),
1197 },
1198 Struct(fields) => {
1200 let mut new_fields = std::collections::HashMap::new();
1203 for (i, field) in fields.iter().enumerate() {
1204 if let Cow::Owned(new_field) = canonicalize_and_verify_field(field)? {
1205 new_fields.insert(i, new_field);
1206 }
1207 }
1208
1209 if new_fields.is_empty() {
1210 borrow!()
1211 } else {
1212 let new_fields = fields
1213 .iter()
1214 .enumerate()
1215 .map(|(i, field)| new_fields.remove(&i).unwrap_or_else(|| field.clone()));
1216 Cow::Owned(DataType::Struct(new_fields.collect()))
1217 }
1218 }
1219 Map(..) | Union(..) => fail!(),
1220
1221 Dictionary(..) | RunEndEncoded(..) => fail!(),
1223 };
1224 Ok(new_data_type)
1225}
1226
1227fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, Arc<Field>>> {
1228 let new_data_type = canonicalize_and_verify_data_type(field.data_type())?;
1229
1230 if matches!(new_data_type.as_ref(), DataType::FixedSizeBinary(16))
1234 && !field.has_valid_extension_type::<UuidExtension>()
1235 {
1236 let new_field = field.as_ref().clone().with_extension_type(UuidExtension);
1237 return Ok(Cow::Owned(Arc::new(new_field)));
1238 }
1239
1240 let Cow::Owned(new_data_type) = new_data_type else {
1241 return Ok(Cow::Borrowed(field));
1242 };
1243 let new_field = field.as_ref().clone().with_data_type(new_data_type);
1244 Ok(Cow::Owned(Arc::new(new_field)))
1245}
1246
1247#[cfg(test)]
1248mod test {
1249 use crate::VariantArrayBuilder;
1250 use std::str::FromStr;
1251
1252 use super::*;
1253 use arrow::array::{
1254 BinaryArray, BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array,
1255 FixedSizeBinaryArray, Int32Array, Int64Array, LargeBinaryArray, LargeListArray,
1256 LargeListViewArray, ListArray, ListViewArray, Time64MicrosecondArray,
1257 };
1258 use arrow::buffer::{OffsetBuffer, ScalarBuffer};
1259 use arrow_schema::{Field, Fields};
1260 use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, ShortString};
1261
// A non-struct input must be rejected by `VariantArray::try_new`.
#[test]
fn invalid_not_a_struct_array() {
    let not_a_struct = make_binary_view_array();
    let message = VariantArray::try_new(&not_a_struct)
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "Invalid argument error: Invalid VariantArray: requires StructArray as input"
    );
}
1272
// A struct without a `metadata` column must be rejected.
#[test]
fn invalid_missing_metadata() {
    let value_only = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]);
    let input = StructArray::new(value_only, vec![make_binary_view_array()], None);
    let message = VariantArray::try_new(&input).unwrap_err().to_string();
    assert_eq!(
        message,
        "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
    );
}
1284
// A struct with only `metadata` is accepted; every valid row reads as Null.
#[test]
fn all_null_missing_value_and_typed_value() {
    let metadata_only = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
    let input = StructArray::new(metadata_only, vec![make_binary_view_array()], None);

    let variant_array = VariantArray::try_new(&input).unwrap();

    // Neither optional column is present.
    let state = variant_array.shredding_state();
    assert!(state.value_field().is_none() && state.typed_value_field().is_none());

    (0..variant_array.len())
        .filter(|&row| variant_array.is_valid(row))
        .for_each(|row| assert_eq!(variant_array.value(row), parquet_variant::Variant::Null));
}
1311
// A `metadata` column that is not binary-like must be rejected.
#[test]
fn invalid_metadata_field_type() {
    let metadata_field = Field::new("metadata", DataType::Int32, true);
    let value_field = Field::new("value", DataType::BinaryView, true);
    let columns = vec![make_int32_array(), make_binary_view_array()];
    let input = StructArray::new(
        Fields::from(vec![metadata_field, value_field]),
        columns,
        None,
    );

    let message = VariantArray::try_new(&input).unwrap_err().to_string();
    assert_eq!(
        message,
        "Invalid argument error: VariantArray 'metadata' field must be Binary, LargeBinary, or BinaryView, got Int32"
    );
}
1329
// A `value` column that is not binary-like must be rejected.
#[test]
fn invalid_value_field_type() {
    let metadata_field = Field::new("metadata", DataType::BinaryView, true);
    let value_field = Field::new("value", DataType::Int32, true);
    let columns = vec![make_binary_view_array(), make_int32_array()];
    let input = StructArray::new(
        Fields::from(vec![metadata_field, value_field]),
        columns,
        None,
    );

    let message = VariantArray::try_new(&input).unwrap_err().to_string();
    assert_eq!(
        message,
        "Invalid argument error: VariantArray 'value' field must be Binary, LargeBinary, or BinaryView, got Int32"
    );
}
1347
1348 fn make_binary_view_array() -> ArrayRef {
1349 Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]]))
1350 }
1351
1352 fn make_int32_array() -> ArrayRef {
1353 Arc::new(Int32Array::from(vec![1]))
1354 }
1355
1356 fn make_variant_struct_with_typed_value(typed_value: ArrayRef) -> StructArray {
1357 let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
1358 EMPTY_VARIANT_METADATA_BYTES,
1359 typed_value.len(),
1360 ));
1361 StructArrayBuilder::new()
1362 .with_field("metadata", Arc::new(metadata), false)
1363 .with_field("typed_value", typed_value, true)
1364 .build()
1365 }
1366
// An untagged FixedSizeBinary(16) `typed_value` gains the UUID extension type.
#[test]
fn try_new_tags_untagged_uuid_on_read() {
    let untagged =
        FixedSizeBinaryArray::try_from_iter(std::iter::repeat_n([0u8; 16], 2)).unwrap();
    let input = make_variant_struct_with_typed_value(Arc::new(untagged));

    let variant_array = VariantArray::try_new(&input).unwrap();

    let field = variant_array
        .inner()
        .field_by_name("typed_value")
        .unwrap();
    assert_eq!(field.data_type(), &DataType::FixedSizeBinary(16));
    assert!(field.has_valid_extension_type::<UuidExtension>());
}
1380
// The UUID extension type is also added to a FixedSizeBinary(16) leaf nested
// inside a struct-typed `typed_value`.
#[test]
fn try_new_tags_untagged_nested_uuid_on_read() {
    let untagged_leaf =
        FixedSizeBinaryArray::try_from_iter(std::iter::repeat_n([0u8; 16], 1)).unwrap();
    let id_struct = StructArrayBuilder::new()
        .with_field("typed_value", Arc::new(untagged_leaf), true)
        .build();
    let object_struct = StructArrayBuilder::new()
        .with_field("id", Arc::new(id_struct), false)
        .build();
    let input = make_variant_struct_with_typed_value(Arc::new(object_struct));

    let variant_array = VariantArray::try_new(&input).unwrap();

    let typed_value = variant_array.typed_value_field().unwrap().as_struct();
    let id_column = typed_value.column_by_name("id").unwrap().as_struct();
    let leaf_field = id_column.field_by_name("typed_value").unwrap();
    assert!(leaf_field.has_valid_extension_type::<UuidExtension>());
}
1401
// A state built from no columns reports no columns.
#[test]
fn all_null_shredding_state() {
    let state = ShreddingState::new(None, None);
    assert!(state.value_field().is_none() && state.typed_value_field().is_none());
}
1413
// A metadata-only struct whose rows are all null yields an all-null VariantArray.
#[test]
fn all_null_variant_array_construction() {
    let metadata: ArrayRef = Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]; 3]));
    let all_null = NullBuffer::from(vec![false; 3]);
    let metadata_only = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
    let struct_array = StructArray::new(metadata_only, vec![metadata], Some(all_null));

    let variant_array = VariantArray::try_new(&struct_array).unwrap();

    // Neither optional column is present.
    let state = variant_array.shredding_state();
    assert!(state.value_field().is_none() && state.typed_value_field().is_none());

    assert_eq!(variant_array.len(), 3);
    assert!(!variant_array.is_valid(0));
    assert!(!variant_array.is_valid(1));
    assert!(!variant_array.is_valid(2));

    // Same check again, driven by the array's own length.
    (0..variant_array.len()).for_each(|i| {
        assert!(
            !variant_array.is_valid(i),
            "Expected value at index {i} to be null"
        );
    });
}
1447
// A present-but-all-null `value` column is still tracked as a `value` column.
#[test]
fn value_field_present_but_all_null_should_be_unshredded() {
    let metadata: ArrayRef = Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]; 3]));

    // Build a `value` column of three rows, all marked null.
    let all_null = NullBuffer::from(vec![false; 3]);
    let value_data = BinaryViewArray::from_iter_values(vec![""; 3])
        .to_data()
        .into_builder()
        .nulls(Some(all_null))
        .build()
        .unwrap();
    let value: ArrayRef = Arc::new(BinaryViewArray::from(value_data));

    let fields = Fields::from(vec![
        Field::new("metadata", DataType::BinaryView, false),
        Field::new("value", DataType::BinaryView, true),
    ]);
    // The struct itself carries no null buffer.
    let struct_array = StructArray::new(fields, vec![metadata, value], None);

    let variant_array = VariantArray::try_new(&struct_array).unwrap();

    let state = variant_array.shredding_state();
    assert!(state.value_field().is_some() && state.typed_value_field().is_none());
}
1486
// LargeList / ListView / LargeListView over each binary item type are already
// canonical and must come back unchanged.
#[test]
fn canonicalize_and_verify_list_like_data_types() {
    let item_types = [DataType::Binary, DataType::LargeBinary, DataType::BinaryView];
    let list_constructors: [fn(FieldRef) -> DataType; 3] = [
        DataType::LargeList,
        DataType::ListView,
        DataType::LargeListView,
    ];

    for item_type in item_types {
        for make_list in list_constructors {
            let item_field = Arc::new(Field::new("item", item_type.clone(), true));
            let input = make_list(item_field);
            assert_eq!(
                canonicalize_and_verify_data_type(&input).unwrap().as_ref(),
                &input
            );
        }
    }
}
1518
// Every list-like `typed_value` flavour is accepted and keeps its data type.
#[test]
fn variant_array_try_new_supports_list_like_typed_value() {
    let element_field = Arc::new(Field::new("item", DataType::Int64, true));
    let elements: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)]));

    let list = ListArray::new(
        element_field.clone(),
        OffsetBuffer::new(ScalarBuffer::from(vec![0, 2, 3])),
        elements.clone(),
        None,
    );
    let large_list = LargeListArray::new(
        element_field.clone(),
        OffsetBuffer::new(ScalarBuffer::from(vec![0_i64, 2, 3])),
        elements.clone(),
        None,
    );
    let list_view = ListViewArray::new(
        element_field.clone(),
        ScalarBuffer::from(vec![0, 2]),
        ScalarBuffer::from(vec![2, 1]),
        elements.clone(),
        None,
    );
    let large_list_view = LargeListViewArray::new(
        element_field,
        ScalarBuffer::from(vec![0_i64, 2]),
        ScalarBuffer::from(vec![2_i64, 1]),
        elements,
        None,
    );

    let typed_values: [ArrayRef; 4] = [
        Arc::new(list),
        Arc::new(large_list),
        Arc::new(list_view),
        Arc::new(large_list_view),
    ];

    for typed_value in typed_values {
        let expected_type = typed_value.data_type().clone();
        let input = make_variant_struct_with_typed_value(typed_value);
        let variant_array = VariantArray::try_new(&input).unwrap();
        assert_eq!(
            variant_array.typed_value_field().unwrap().data_type(),
            &expected_type,
        );
    }
}
1562
#[test]
fn test_variant_array_iterable() {
    // `None` rows are appended as array-level nulls; `Some(Variant::Null)` is a
    // present row that holds the variant null value.
    let rows = [
        None,
        Some(Variant::from(1_i8)),
        Some(Variant::Null),
        Some(Variant::from(2_i32)),
        Some(Variant::from(3_i64)),
        None,
    ];

    let mut builder = VariantArrayBuilder::new(6);
    for row in rows {
        match row {
            Some(variant) => {
                builder.append_variant(variant);
            }
            None => {
                builder.append_null();
            }
        }
    }
    let array = builder.build();

    // Iteration yields one entry per row, in insertion order.
    let expected = vec![
        None,
        Some(Variant::Int8(1)),
        Some(Variant::Null),
        Some(Variant::Int32(2)),
        Some(Variant::Int64(3)),
        None,
    ];
    let collected: Vec<_> = array.iter().collect();
    assert_eq!(collected, expected);
}
1590
#[test]
fn test_variant_array_iter_double_ended() {
    // Rows: 0, null, 2, null, 4.
    let mut builder = VariantArrayBuilder::new(5);
    for slot in [Some(0_i32), None, Some(2_i32), None, Some(4_i32)] {
        match slot {
            Some(number) => {
                builder.append_variant(Variant::from(number));
            }
            None => {
                builder.append_null();
            }
        }
    }

    let array = builder.build();
    let mut cursor = array.iter();

    // Take the first two rows from the front...
    let front_first = cursor.next();
    let front_second = cursor.next();
    assert_eq!(front_first, Some(Some(Variant::from(0_i32))));
    assert_eq!(front_second, Some(None));

    // ...and the remaining three from the back, last row first.
    let back_first = cursor.next_back();
    let back_second = cursor.next_back();
    let back_third = cursor.next_back();
    assert_eq!(back_first, Some(Some(Variant::from(4_i32))));
    assert_eq!(back_second, Some(None));
    assert_eq!(back_third, Some(Some(Variant::from(2_i32))));

    // Both ends have met: the iterator is exhausted in either direction.
    assert_eq!(cursor.next_back(), None);
    assert_eq!(cursor.next(), None);
}
1614
#[test]
fn test_variant_array_iter_reverse() {
    // Rows: "a", null, "aaa", null, "aaaaa".
    let mut builder = VariantArrayBuilder::new(5);
    for slot in [Some("a"), None, Some("aaa"), None, Some("aaaaa")] {
        match slot {
            Some(text) => {
                builder.append_variant(Variant::from(text));
            }
            None => {
                builder.append_null();
            }
        }
    }
    let array = builder.build();

    // Walking the iterator backwards must yield the rows in reverse insertion order.
    let expected = vec![
        Some(Variant::from("aaaaa")),
        None,
        Some(Variant::from("aaa")),
        None,
        Some(Variant::from("a")),
    ];
    let reversed: Vec<_> = array.iter().rev().collect();
    assert_eq!(reversed, expected);
}
1639
#[test]
fn test_variant_array_iter_empty() {
    // An array built from an empty builder has nothing to yield from either end.
    let empty = VariantArrayBuilder::new(0).build();
    let mut cursor = empty.iter();

    assert_eq!(cursor.next(), None);
    assert_eq!(cursor.next_back(), None);
}
1647
#[test]
fn test_from_variant_opts_into_variant_array() {
    // `None` entries must become null rows; `Some(..)` entries must be stored as-is,
    // including `Some(Variant::Null)`, which is a present row rather than a null one.
    let rows = [None, Some(Variant::Null), Some(Variant::BooleanFalse), None];

    let array = VariantArray::from_iter(rows.clone());
    assert_eq!(array.len(), 4);

    for (index, row) in rows.iter().enumerate() {
        match row {
            Some(expected) => {
                assert!(!array.is_null(index));
                assert_eq!(&array.value(index), expected);
            }
            None => assert!(array.is_null(index)),
        }
    }
}
1666
#[test]
fn test_from_variants_into_variant_array() {
    // Collecting bare variants (no `Option` wrapper) must produce an array without
    // null rows, holding each value at its original position.
    let rows = [
        Variant::Null,
        Variant::BooleanFalse,
        Variant::ShortString(ShortString::try_new("norm").unwrap()),
    ];

    let array = VariantArray::from_iter(rows.clone());
    assert_eq!(array.len(), 3);

    for (index, expected) in rows.iter().enumerate() {
        assert!(!array.is_null(index));
        assert_eq!(&array.value(index), expected);
    }
}
1691
#[test]
fn test_variant_equality() {
    let rows = [None, Some(Variant::BooleanFalse), Some(Variant::Null), None];
    let original = VariantArray::from_iter(rows.clone());

    // A clone compares equal to its source.
    let duplicate = original.clone();
    assert_eq!(original, duplicate);

    // The same rows in the opposite order do not compare equal.
    let reversed = VariantArray::from_iter(rows.iter().rev().cloned());
    assert_ne!(original, reversed);

    // A one-row slice does not compare equal to the full array.
    let prefix = original.slice(0, 1);
    assert_ne!(original, prefix);
}
1714
#[test]
fn binary_typed_value_roundtrips() {
    let payload: &[u8] = b"hello";

    // One row: empty variant metadata plus a `Binary` typed_value column, with no
    // `value` column at all.
    let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([
        EMPTY_VARIANT_METADATA_BYTES,
    ]));
    let typed_value: ArrayRef = Arc::new(BinaryArray::from(vec![payload]));

    let mut fields = StructArrayBuilder::new();
    fields = fields.with_field("metadata", metadata, false);
    fields = fields.with_field("typed_value", typed_value, true);
    let struct_array = fields.build();

    // Reading the row back must give a binary variant carrying the same bytes.
    let variant_array = VariantArray::try_new(&struct_array).unwrap();
    assert_eq!(variant_array.value(0), Variant::from(payload));
}
1731
#[test]
fn large_binary_typed_value_roundtrips() {
    let payload: &[u8] = b"world";

    // One row: empty variant metadata plus a `LargeBinary` typed_value column, with
    // no `value` column at all.
    let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([
        EMPTY_VARIANT_METADATA_BYTES,
    ]));
    let typed_value: ArrayRef = Arc::new(LargeBinaryArray::from(vec![payload]));

    let mut fields = StructArrayBuilder::new();
    fields = fields.with_field("metadata", metadata, false);
    fields = fields.with_field("typed_value", typed_value, true);
    let struct_array = fields.build();

    // Reading the row back must give a binary variant carrying the same bytes.
    let variant_array = VariantArray::try_new(&struct_array).unwrap();
    assert_eq!(variant_array.value(0), Variant::from(payload));
}
1748
/// Generates a `#[test]` named `$fn_name` checking that reading row 0 of a
/// variant array whose `typed_value` column is `$invalid_typed_value` fails.
///
/// The generated test:
/// 1. builds a one-row struct with empty variant metadata and the given
///    `typed_value` column (no `value` column),
/// 2. expects `VariantArray::try_new` to accept it (construction itself does not
///    inspect the offending value),
/// 3. expects `try_value(0)` to return an `ArrowError::CastError` whose rendered
///    message contains `$error_msg`.
macro_rules! invalid_variant_array_test {
    ($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => {
        #[test]
        fn $fn_name() {
            let metadata = BinaryViewArray::from_iter_values([EMPTY_VARIANT_METADATA_BYTES]);
            let invalid_typed_value = $invalid_typed_value;

            let struct_array = StructArrayBuilder::new()
                .with_field("metadata", Arc::new(metadata), false)
                .with_field("typed_value", Arc::new(invalid_typed_value), true)
                .build();

            let array =
                VariantArray::try_new(&struct_array).expect("should create variant array");

            // The failure must surface lazily, when the row is actually decoded.
            let error = array
                .try_value(0)
                .expect_err("try_value(0) should fail for an invalid typed value");
            assert!(
                matches!(error, ArrowError::CastError(_)),
                "expected a cast error, got `{}`",
                error
            );

            let expected: &str = $error_msg;
            assert!(
                error.to_string().contains(expected),
                "error `{}` did not contain `{}`",
                error,
                expected
            );
        }
    };
}
1783
1784 invalid_variant_array_test!(
1785 test_variant_array_invalide_time,
1786 Time64MicrosecondArray::from(vec![Some(86401000000)]),
1787 "Cast error: Cast failed at index 0 (array type: Time64(µs)): Invalid microsecond from midnight: 86401000000"
1788 );
1789
1790 invalid_variant_array_test!(
1791 test_variant_array_invalid_decimal32,
1792 Decimal32Array::from(vec![Some(1234567890)]),
1793 "Cast error: Cast failed at index 0 (array type: Decimal32(9, 2)): Invalid argument error: 1234567890 is wider than max precision 9"
1794 );
1795
1796 invalid_variant_array_test!(
1797 test_variant_array_invalid_decimal64,
1798 Decimal64Array::from(vec![Some(1234567890123456789)]),
1799 "Cast error: Cast failed at index 0 (array type: Decimal64(18, 6)): Invalid argument error: 1234567890123456789 is wider than max precision 18"
1800 );
1801
1802 invalid_variant_array_test!(
1803 test_variant_array_invalid_decimal128,
1804 Decimal128Array::from(vec![Some(
1805 i128::from_str("123456789012345678901234567890123456789").unwrap()
1806 ),]),
1807 "Cast error: Cast failed at index 0 (array type: Decimal128(38, 10)): Invalid argument error: 123456789012345678901234567890123456789 is wider than max precision 38"
1808 );
1809}