1use crate::VariantArrayBuilder;
21use crate::type_conversion::{
22 generic_conversion_single_value, generic_conversion_single_value_with_result,
23 primitive_conversion_single_value,
24};
25use arrow::array::{Array, ArrayRef, AsArray, StructArray};
26use arrow::buffer::NullBuffer;
27use arrow::compute::cast;
28use arrow::datatypes::{
29 Date32Type, Decimal32Type, Decimal64Type, Decimal128Type, Float16Type, Float32Type,
30 Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, Time64MicrosecondType,
31 TimestampMicrosecondType, TimestampNanosecondType,
32};
33use arrow::error::Result;
34use arrow_schema::extension::ExtensionType;
35use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
36use chrono::{DateTime, NaiveTime};
37use parquet_variant::{
38 Uuid, Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16, VariantDecimalType as _,
39};
40
41use std::borrow::Cow;
42use std::sync::Arc;
43
44pub(crate) fn binary_array_value(array: &dyn Array, index: usize) -> Option<&[u8]> {
46 match array.data_type() {
47 DataType::Binary => Some(array.as_binary::<i32>().value(index)),
48 DataType::LargeBinary => Some(array.as_binary::<i64>().value(index)),
49 DataType::BinaryView => Some(array.as_binary_view().value(index)),
50 _ => None,
51 }
52}
53
54pub(crate) fn variant_from_arrays_at<'m, 'v>(
57 metadata: &'m dyn Array,
58 value: &'v dyn Array,
59 index: usize,
60) -> Option<Variant<'m, 'v>> {
61 let metadata = binary_array_value(metadata, index)?;
62 let value = binary_array_value(value, index)?;
63 Some(Variant::new(metadata, value))
64}
65
66fn validate_binary_array(array: &dyn Array, field_name: &str) -> Result<()> {
68 match array.data_type() {
69 DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Ok(()),
70 _ => Err(ArrowError::InvalidArgumentError(format!(
71 "VariantArray '{field_name}' field must be Binary, LargeBinary, or BinaryView, got {}",
72 array.data_type()
73 ))),
74 }
75}
76
77pub struct VariantType;
82
83impl ExtensionType for VariantType {
84 const NAME: &'static str = "arrow.parquet.variant";
85
86 type Metadata = &'static str;
89
90 fn metadata(&self) -> &Self::Metadata {
91 &""
92 }
93
94 fn serialize_metadata(&self) -> Option<String> {
95 Some(String::new())
96 }
97
98 fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata> {
99 Ok("")
100 }
101
102 fn supports_data_type(&self, data_type: &DataType) -> Result<()> {
103 if matches!(data_type, DataType::Struct(_)) {
104 Ok(())
105 } else {
106 Err(ArrowError::InvalidArgumentError(format!(
107 "VariantType only supports StructArray, got {data_type}"
108 )))
109 }
110 }
111
112 fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self> {
113 Self.supports_data_type(data_type)?;
114 Ok(Self)
115 }
116
117 fn validate(data_type: &DataType, _metadata: Self::Metadata) -> Result<()> {
118 Self.supports_data_type(data_type)
119 }
120}
121
122#[derive(Debug, Clone)]
253pub struct VariantArray {
254 inner: StructArray,
256
257 metadata: ArrayRef,
259
260 shredding_state: ShreddingState,
262}
263
264impl VariantArray {
265 pub fn try_new(inner: &dyn Array) -> Result<Self> {
292 let inner = canonicalize_shredded_types(inner)?;
294
295 let Some(inner) = inner.as_struct_opt() else {
296 return Err(ArrowError::InvalidArgumentError(
297 "Invalid VariantArray: requires StructArray as input".to_string(),
298 ));
299 };
300
301 let Some(metadata_col) = inner.column_by_name("metadata") else {
305 return Err(ArrowError::InvalidArgumentError(
306 "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
307 ));
308 };
309 validate_binary_array(metadata_col.as_ref(), "metadata")?;
310
311 Ok(Self {
313 inner: inner.clone(),
314 metadata: metadata_col.clone(),
315 shredding_state: ShreddingState::try_from(inner)?,
316 })
317 }
318
319 pub(crate) fn from_parts(
320 metadata: ArrayRef,
321 value: Option<ArrayRef>,
322 typed_value: Option<ArrayRef>,
323 nulls: Option<NullBuffer>,
324 ) -> Self {
325 let mut builder = StructArrayBuilder::new().with_field("metadata", metadata.clone(), false);
326 if let Some(value) = value.clone() {
327 builder = builder.with_field("value", value, true);
328 }
329 if let Some(typed_value) = typed_value.clone() {
330 builder = builder.with_field("typed_value", typed_value, true);
331 }
332 if let Some(nulls) = nulls {
333 builder = builder.with_nulls(nulls);
334 }
335
336 Self {
337 inner: builder.build(),
338 metadata,
339 shredding_state: ShreddingState::new(value, typed_value),
340 }
341 }
342
343 pub fn inner(&self) -> &StructArray {
345 &self.inner
346 }
347
348 pub fn into_inner(self) -> StructArray {
350 self.inner
351 }
352
353 pub fn shredding_state(&self) -> &ShreddingState {
355 &self.shredding_state
356 }
357
358 pub fn value(&self, index: usize) -> Variant<'_, '_> {
368 self.try_value(index).unwrap()
369 }
370
371 pub fn try_value(&self, index: usize) -> Result<Variant<'_, '_>> {
399 match (self.typed_value_field(), self.value_field()) {
400 (Some(typed_value), value) if typed_value.is_valid(index) => {
402 typed_value_to_variant(typed_value, value, index)
403 }
404 (_, Some(value)) if value.is_valid(index) => variant_from_arrays_at(
406 &self.metadata,
407 value,
408 index,
409 )
410 .ok_or_else(|| {
411 ArrowError::InvalidArgumentError(format!(
412 "metadata and value fields must be binary-like arrays, instead got {} and {}",
413 self.metadata.data_type(),
414 value.data_type()
415 ))
416 }),
417 _ => Ok(Variant::Null),
420 }
421 }
422
423 pub fn metadata_field(&self) -> &ArrayRef {
425 &self.metadata
426 }
427
428 pub fn value_field(&self) -> Option<&ArrayRef> {
430 self.shredding_state.value_field()
431 }
432
433 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
435 self.shredding_state.typed_value_field()
436 }
437
438 pub fn field(&self, name: impl Into<String>) -> Field {
441 Field::new(
442 name.into(),
443 self.data_type().clone(),
444 self.inner.is_nullable(),
445 )
446 .with_extension_type(VariantType)
447 }
448
449 pub fn data_type(&self) -> &DataType {
451 self.inner.data_type()
452 }
453
454 pub fn slice(&self, offset: usize, length: usize) -> Self {
455 let inner = self.inner.slice(offset, length);
456 let metadata = self.metadata.slice(offset, length);
457 let shredding_state = self.shredding_state.slice(offset, length);
458 Self {
459 inner,
460 metadata,
461 shredding_state,
462 }
463 }
464
465 pub fn len(&self) -> usize {
466 self.inner.len()
467 }
468
469 pub fn is_empty(&self) -> bool {
470 self.inner.is_empty()
471 }
472
473 pub fn nulls(&self) -> Option<&NullBuffer> {
474 self.inner.nulls()
475 }
476
477 pub fn is_null(&self, index: usize) -> bool {
479 self.nulls().is_some_and(|n| n.is_null(index))
480 }
481
482 pub fn is_valid(&self, index: usize) -> bool {
484 !self.is_null(index)
485 }
486
487 pub fn iter(&self) -> VariantArrayIter<'_> {
489 VariantArrayIter::new(self)
490 }
491}
492
493impl PartialEq for VariantArray {
494 fn eq(&self, other: &Self) -> bool {
495 self.inner == other.inner
496 }
497}
498
499impl From<VariantArray> for StructArray {
500 fn from(variant_array: VariantArray) -> Self {
501 variant_array.into_inner()
502 }
503}
504
505impl From<VariantArray> for ArrayRef {
506 fn from(variant_array: VariantArray) -> Self {
507 Arc::new(variant_array.into_inner())
508 }
509}
510
511impl<'m, 'v> FromIterator<Option<Variant<'m, 'v>>> for VariantArray {
512 fn from_iter<T: IntoIterator<Item = Option<Variant<'m, 'v>>>>(iter: T) -> Self {
513 let iter = iter.into_iter();
514
515 let mut b = VariantArrayBuilder::new(iter.size_hint().0);
516 b.extend(iter);
517 b.build()
518 }
519}
520
521impl<'m, 'v> FromIterator<Variant<'m, 'v>> for VariantArray {
522 fn from_iter<T: IntoIterator<Item = Variant<'m, 'v>>>(iter: T) -> Self {
523 Self::from_iter(iter.into_iter().map(Some))
524 }
525}
526
527#[derive(Debug)]
552pub struct VariantArrayIter<'a> {
553 array: &'a VariantArray,
554 head_i: usize,
555 tail_i: usize,
556}
557
558impl<'a> VariantArrayIter<'a> {
559 pub fn new(array: &'a VariantArray) -> Self {
561 Self {
562 array,
563 head_i: 0,
564 tail_i: array.len(),
565 }
566 }
567
568 fn value_opt(&self, i: usize) -> Option<Variant<'a, 'a>> {
569 self.array.is_valid(i).then(|| self.array.value(i))
570 }
571}
572
573impl<'a> Iterator for VariantArrayIter<'a> {
574 type Item = Option<Variant<'a, 'a>>;
575
576 #[inline]
577 fn next(&mut self) -> Option<Self::Item> {
578 if self.head_i == self.tail_i {
579 return None;
580 }
581
582 let out = self.value_opt(self.head_i);
583
584 self.head_i += 1;
585
586 Some(out)
587 }
588
589 fn size_hint(&self) -> (usize, Option<usize>) {
590 let remainder = self.tail_i - self.head_i;
591
592 (remainder, Some(remainder))
593 }
594}
595
596impl<'a> DoubleEndedIterator for VariantArrayIter<'a> {
597 fn next_back(&mut self) -> Option<Self::Item> {
598 if self.head_i == self.tail_i {
599 return None;
600 }
601
602 self.tail_i -= 1;
603
604 Some(self.value_opt(self.tail_i))
605 }
606}
607
608impl<'a> ExactSizeIterator for VariantArrayIter<'a> {}
609
610#[derive(Debug)]
645pub struct ShreddedVariantFieldArray {
646 inner: StructArray,
648 shredding_state: ShreddingState,
649}
650
651#[allow(unused)]
652impl ShreddedVariantFieldArray {
653 pub fn try_new(inner: &dyn Array) -> Result<Self> {
673 let Some(inner_struct) = inner.as_struct_opt() else {
674 return Err(ArrowError::InvalidArgumentError(
675 "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(),
676 ));
677 };
678
679 Ok(Self {
681 inner: inner_struct.clone(),
682 shredding_state: ShreddingState::try_from(inner_struct)?,
683 })
684 }
685
686 pub fn shredding_state(&self) -> &ShreddingState {
688 &self.shredding_state
689 }
690
691 pub fn value_field(&self) -> Option<&ArrayRef> {
693 self.shredding_state.value_field()
694 }
695
696 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
698 self.shredding_state.typed_value_field()
699 }
700
701 pub fn inner(&self) -> &StructArray {
703 &self.inner
704 }
705
706 pub(crate) fn from_parts(
707 value: Option<ArrayRef>,
708 typed_value: Option<ArrayRef>,
709 nulls: Option<NullBuffer>,
710 ) -> Self {
711 let mut builder = StructArrayBuilder::new();
712 if let Some(value) = value.clone() {
713 builder = builder.with_field("value", value, true);
714 }
715 if let Some(typed_value) = typed_value.clone() {
716 builder = builder.with_field("typed_value", typed_value, true);
717 }
718 if let Some(nulls) = nulls {
719 builder = builder.with_nulls(nulls);
720 }
721
722 Self {
723 inner: builder.build(),
724 shredding_state: ShreddingState::new(value, typed_value),
725 }
726 }
727
728 pub fn into_inner(self) -> StructArray {
730 self.inner
731 }
732
733 pub fn data_type(&self) -> &DataType {
734 self.inner.data_type()
735 }
736
737 pub fn len(&self) -> usize {
738 self.inner.len()
739 }
740
741 pub fn is_empty(&self) -> bool {
742 self.inner.is_empty()
743 }
744
745 pub fn offset(&self) -> usize {
746 self.inner.offset()
747 }
748
749 pub fn nulls(&self) -> Option<&NullBuffer> {
750 None
754 }
755 pub fn is_null(&self, index: usize) -> bool {
757 self.nulls().is_some_and(|n| n.is_null(index))
758 }
759
760 pub fn is_valid(&self, index: usize) -> bool {
762 !self.is_null(index)
763 }
764}
765
766impl From<ShreddedVariantFieldArray> for ArrayRef {
767 fn from(array: ShreddedVariantFieldArray) -> Self {
768 Arc::new(array.into_inner())
769 }
770}
771
772impl From<ShreddedVariantFieldArray> for StructArray {
773 fn from(array: ShreddedVariantFieldArray) -> Self {
774 array.into_inner()
775 }
776}
777
778#[derive(Debug, Clone)]
812pub struct ShreddingState {
813 value: Option<ArrayRef>,
814 typed_value: Option<ArrayRef>,
815}
816
817impl ShreddingState {
818 pub fn new(value: Option<ArrayRef>, typed_value: Option<ArrayRef>) -> Self {
833 Self { value, typed_value }
834 }
835
836 pub fn value_field(&self) -> Option<&ArrayRef> {
838 self.value.as_ref()
839 }
840
841 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
843 self.typed_value.as_ref()
844 }
845
846 pub fn borrow(&self) -> BorrowedShreddingState<'_> {
848 BorrowedShreddingState {
849 value: self.value_field(),
850 typed_value: self.typed_value_field(),
851 }
852 }
853
854 pub fn slice(&self, offset: usize, length: usize) -> Self {
856 Self {
857 value: self.value.as_ref().map(|v| v.slice(offset, length)),
858 typed_value: self.typed_value.as_ref().map(|tv| tv.slice(offset, length)),
859 }
860 }
861}
862
863#[derive(Clone, Debug)]
866pub struct BorrowedShreddingState<'a> {
867 value: Option<&'a ArrayRef>,
868 typed_value: Option<&'a ArrayRef>,
869}
870
871impl<'a> BorrowedShreddingState<'a> {
872 pub fn new(value: Option<&'a ArrayRef>, typed_value: Option<&'a ArrayRef>) -> Self {
887 Self { value, typed_value }
888 }
889
890 pub fn value_field(&self) -> Option<&'a ArrayRef> {
892 self.value
893 }
894
895 pub fn typed_value_field(&self) -> Option<&'a ArrayRef> {
897 self.typed_value
898 }
899}
900
901impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
902 type Error = ArrowError;
903
904 fn try_from(inner_struct: &'a StructArray) -> Result<Self> {
905 let value = if let Some(value_col) = inner_struct.column_by_name("value") {
907 validate_binary_array(value_col.as_ref(), "value")?;
908 Some(value_col)
909 } else {
910 None
911 };
912 let typed_value = inner_struct.column_by_name("typed_value");
913 Ok(BorrowedShreddingState::new(value, typed_value))
914 }
915}
916
917impl TryFrom<&StructArray> for ShreddingState {
918 type Error = ArrowError;
919
920 fn try_from(inner_struct: &StructArray) -> Result<Self> {
921 Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
922 }
923}
924
925impl From<BorrowedShreddingState<'_>> for ShreddingState {
926 fn from(state: BorrowedShreddingState<'_>) -> Self {
927 ShreddingState {
928 value: state.value_field().cloned(),
929 typed_value: state.typed_value_field().cloned(),
930 }
931 }
932}
933
934#[derive(Debug, Default, Clone)]
938pub(crate) struct StructArrayBuilder {
939 fields: Vec<FieldRef>,
940 arrays: Vec<ArrayRef>,
941 nulls: Option<NullBuffer>,
942}
943
944impl StructArrayBuilder {
945 pub fn new() -> Self {
946 Default::default()
947 }
948
949 pub fn with_field(mut self, field_name: &str, array: ArrayRef, nullable: bool) -> Self {
951 let field = Field::new(field_name, array.data_type().clone(), nullable);
952 self.fields.push(Arc::new(field));
953 self.arrays.push(array);
954 self
955 }
956
957 pub fn with_nulls(mut self, nulls: NullBuffer) -> Self {
959 self.nulls = Some(nulls);
960 self
961 }
962
963 pub fn build(self) -> StructArray {
964 let Self {
965 fields,
966 arrays,
967 nulls,
968 } = self;
969 StructArray::new(Fields::from(fields), arrays, nulls)
970 }
971}
972
973fn typed_value_to_variant<'a>(
975 typed_value: &'a ArrayRef,
976 value: Option<&'a ArrayRef>,
977 index: usize,
978) -> Result<Variant<'a, 'a>> {
979 let data_type = typed_value.data_type();
980 if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) {
981 panic!("Invalid variant, conflicting value and typed_value");
983 }
984 match data_type {
985 DataType::Null => Ok(Variant::Null),
986 DataType::Boolean => {
987 let boolean_array = typed_value.as_boolean();
988 let value = boolean_array.value(index);
989 Ok(Variant::from(value))
990 }
991 DataType::FixedSizeBinary(16) => {
993 let array = typed_value.as_fixed_size_binary();
994 let value = array.value(index);
995 Ok(Uuid::from_slice(value).unwrap().into()) }
997 DataType::Binary => {
998 let array = typed_value.as_binary::<i32>();
999 let value = array.value(index);
1000 Ok(Variant::from(value))
1001 }
1002 DataType::LargeBinary => {
1003 let array = typed_value.as_binary::<i64>();
1004 let value = array.value(index);
1005 Ok(Variant::from(value))
1006 }
1007 DataType::BinaryView => {
1008 let array = typed_value.as_binary_view();
1009 let value = array.value(index);
1010 Ok(Variant::from(value))
1011 }
1012 DataType::Utf8 => {
1013 let array = typed_value.as_string::<i32>();
1014 let value = array.value(index);
1015 Ok(Variant::from(value))
1016 }
1017 DataType::LargeUtf8 => {
1018 let array = typed_value.as_string::<i64>();
1019 let value = array.value(index);
1020 Ok(Variant::from(value))
1021 }
1022 DataType::Utf8View => {
1023 let array = typed_value.as_string_view();
1024 let value = array.value(index);
1025 Ok(Variant::from(value))
1026 }
1027 DataType::Int8 => {
1028 primitive_conversion_single_value!(Int8Type, typed_value, index)
1029 }
1030 DataType::Int16 => {
1031 primitive_conversion_single_value!(Int16Type, typed_value, index)
1032 }
1033 DataType::Int32 => {
1034 primitive_conversion_single_value!(Int32Type, typed_value, index)
1035 }
1036 DataType::Int64 => {
1037 primitive_conversion_single_value!(Int64Type, typed_value, index)
1038 }
1039 DataType::Float16 => {
1040 primitive_conversion_single_value!(Float16Type, typed_value, index)
1041 }
1042 DataType::Float32 => {
1043 primitive_conversion_single_value!(Float32Type, typed_value, index)
1044 }
1045 DataType::Float64 => {
1046 primitive_conversion_single_value!(Float64Type, typed_value, index)
1047 }
1048 DataType::Decimal32(_, s) => {
1049 generic_conversion_single_value_with_result!(
1050 Decimal32Type,
1051 as_primitive,
1052 |v| VariantDecimal4::try_new(v, *s as u8),
1053 typed_value,
1054 index
1055 )
1056 }
1057 DataType::Decimal64(_, s) => {
1058 generic_conversion_single_value_with_result!(
1059 Decimal64Type,
1060 as_primitive,
1061 |v| VariantDecimal8::try_new(v, *s as u8),
1062 typed_value,
1063 index
1064 )
1065 }
1066 DataType::Decimal128(_, s) => {
1067 generic_conversion_single_value_with_result!(
1068 Decimal128Type,
1069 as_primitive,
1070 |v| VariantDecimal16::try_new(v, *s as u8),
1071 typed_value,
1072 index
1073 )
1074 }
1075 DataType::Date32 => {
1076 generic_conversion_single_value!(
1077 Date32Type,
1078 as_primitive,
1079 |v| Date32Type::to_naive_date_opt(v).unwrap(),
1080 typed_value,
1081 index
1082 )
1083 }
1084 DataType::Time64(TimeUnit::Microsecond) => {
1085 generic_conversion_single_value_with_result!(
1086 Time64MicrosecondType,
1087 as_primitive,
1088 |v| NaiveTime::from_num_seconds_from_midnight_opt(
1089 (v / 1_000_000) as u32,
1090 (v % 1_000_000) as u32 * 1000
1091 )
1092 .ok_or_else(|| format!("Invalid microsecond from midnight: {}", v)),
1093 typed_value,
1094 index
1095 )
1096 }
1097 DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => {
1098 generic_conversion_single_value!(
1099 TimestampMicrosecondType,
1100 as_primitive,
1101 |v| DateTime::from_timestamp_micros(v).unwrap(),
1102 typed_value,
1103 index
1104 )
1105 }
1106 DataType::Timestamp(TimeUnit::Microsecond, None) => {
1107 generic_conversion_single_value!(
1108 TimestampMicrosecondType,
1109 as_primitive,
1110 |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(),
1111 typed_value,
1112 index
1113 )
1114 }
1115 DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => {
1116 generic_conversion_single_value!(
1117 TimestampNanosecondType,
1118 as_primitive,
1119 DateTime::from_timestamp_nanos,
1120 typed_value,
1121 index
1122 )
1123 }
1124 DataType::Timestamp(TimeUnit::Nanosecond, None) => {
1125 generic_conversion_single_value!(
1126 TimestampNanosecondType,
1127 as_primitive,
1128 |v| DateTime::from_timestamp_nanos(v).naive_utc(),
1129 typed_value,
1130 index
1131 )
1132 }
1133 _ => {
1136 debug_assert!(
1140 false,
1141 "Unsupported typed_value type: {}",
1142 typed_value.data_type()
1143 );
1144 Ok(Variant::Null)
1145 }
1146 }
1147}
1148
1149fn canonicalize_shredded_types(array: &dyn Array) -> Result<ArrayRef> {
1152 let new_type = canonicalize_and_verify_data_type(array.data_type())?;
1153 if let Cow::Borrowed(_) = new_type {
1154 if let Some(array) = array.as_struct_opt() {
1155 return Ok(Arc::new(array.clone())); }
1157 }
1158 cast(array, new_type.as_ref())
1159}
1160
1161fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_, DataType>> {
1165 use DataType::*;
1166
1167 macro_rules! fail {
1169 () => {
1170 return Err(ArrowError::InvalidArgumentError(format!(
1171 "Illegal shredded value type: {data_type}"
1172 )))
1173 };
1174 }
1175 macro_rules! borrow {
1176 () => {
1177 Cow::Borrowed(data_type)
1178 };
1179 }
1180
1181 let new_data_type = match data_type {
1182 Null | Boolean => borrow!(),
1184 Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(),
1185
1186 UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(),
1188
1189 Decimal64(p, s) | Decimal128(p, s)
1194 if VariantDecimal4::is_valid_precision_and_scale(p, s) =>
1195 {
1196 Cow::Owned(Decimal32(*p, *s))
1197 }
1198 Decimal128(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => {
1199 Cow::Owned(Decimal64(*p, *s))
1200 }
1201 Decimal32(p, s) if VariantDecimal4::is_valid_precision_and_scale(p, s) => borrow!(),
1202 Decimal64(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => borrow!(),
1203 Decimal128(p, s) if VariantDecimal16::is_valid_precision_and_scale(p, s) => borrow!(),
1204 Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(),
1205
1206 Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(),
1208 Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(),
1209
1210 Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
1212 Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
1213
1214 Binary | LargeBinary | BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),
1216
1217 FixedSizeBinary(16) => borrow!(),
1219 FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
1220
1221 List(field) => match canonicalize_and_verify_field(field)? {
1223 Cow::Borrowed(_) => borrow!(),
1224 Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)),
1225 },
1226 LargeList(field) => match canonicalize_and_verify_field(field)? {
1227 Cow::Borrowed(_) => borrow!(),
1228 Cow::Owned(new_field) => Cow::Owned(DataType::LargeList(new_field)),
1229 },
1230 ListView(field) => match canonicalize_and_verify_field(field)? {
1231 Cow::Borrowed(_) => borrow!(),
1232 Cow::Owned(new_field) => Cow::Owned(DataType::ListView(new_field)),
1233 },
1234 LargeListView(field) => match canonicalize_and_verify_field(field)? {
1235 Cow::Borrowed(_) => borrow!(),
1236 Cow::Owned(new_field) => Cow::Owned(DataType::LargeListView(new_field)),
1237 },
1238 Struct(fields) => {
1240 let mut new_fields = std::collections::HashMap::new();
1243 for (i, field) in fields.iter().enumerate() {
1244 if let Cow::Owned(new_field) = canonicalize_and_verify_field(field)? {
1245 new_fields.insert(i, new_field);
1246 }
1247 }
1248
1249 if new_fields.is_empty() {
1250 borrow!()
1251 } else {
1252 let new_fields = fields
1253 .iter()
1254 .enumerate()
1255 .map(|(i, field)| new_fields.remove(&i).unwrap_or_else(|| field.clone()));
1256 Cow::Owned(DataType::Struct(new_fields.collect()))
1257 }
1258 }
1259 Map(..) | Union(..) => fail!(),
1260
1261 Dictionary(..) | RunEndEncoded(..) => fail!(),
1263 };
1264 Ok(new_data_type)
1265}
1266
1267fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, Arc<Field>>> {
1268 let Cow::Owned(new_data_type) = canonicalize_and_verify_data_type(field.data_type())? else {
1269 return Ok(Cow::Borrowed(field));
1270 };
1271 let new_field = field.as_ref().clone().with_data_type(new_data_type);
1272 Ok(Cow::Owned(Arc::new(new_field)))
1273}
1274
1275#[cfg(test)]
1276mod test {
1277 use crate::VariantArrayBuilder;
1278 use std::str::FromStr;
1279
1280 use super::*;
1281 use arrow::array::{
1282 BinaryArray, BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array, Int32Array,
1283 Int64Array, LargeBinaryArray, LargeListArray, LargeListViewArray, ListArray, ListViewArray,
1284 Time64MicrosecondArray,
1285 };
1286 use arrow::buffer::{OffsetBuffer, ScalarBuffer};
1287 use arrow_schema::{Field, Fields};
1288 use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, ShortString};
1289
/// A non-struct input must be rejected by `VariantArray::try_new`.
#[test]
fn invalid_not_a_struct_array() {
    let not_a_struct = make_binary_view_array();
    let message = VariantArray::try_new(&not_a_struct)
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "Invalid argument error: Invalid VariantArray: requires StructArray as input"
    );
}
1300
/// A struct without a `metadata` column must be rejected.
#[test]
fn invalid_missing_metadata() {
    let value_only = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]);
    let columns = vec![make_binary_view_array()];
    let struct_array = StructArray::new(value_only, columns, None);

    let message = VariantArray::try_new(&struct_array)
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
    );
}
1312
/// With only a `metadata` column, construction succeeds, neither shredding
/// column is present, and every valid row reads back as `Variant::Null`.
#[test]
fn all_null_missing_value_and_typed_value() {
    let metadata_only = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
    let struct_array = StructArray::new(metadata_only, vec![make_binary_view_array()], None);

    let variant_array = VariantArray::try_new(&struct_array).unwrap();

    let state = variant_array.shredding_state();
    assert!(matches!(
        state,
        ShreddingState {
            value: None,
            typed_value: None
        }
    ));

    (0..variant_array.len())
        .filter(|&row| variant_array.is_valid(row))
        .for_each(|row| {
            assert_eq!(variant_array.value(row), parquet_variant::Variant::Null);
        });
}
1339
/// A non-binary `metadata` column must be rejected with a descriptive error.
#[test]
fn invalid_metadata_field_type() {
    let schema = Fields::from(vec![
        Field::new("metadata", DataType::Int32, true),
        Field::new("value", DataType::BinaryView, true),
    ]);
    let columns = vec![make_int32_array(), make_binary_view_array()];
    let struct_array = StructArray::new(schema, columns, None);

    let message = VariantArray::try_new(&struct_array)
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "Invalid argument error: VariantArray 'metadata' field must be Binary, LargeBinary, or BinaryView, got Int32"
    );
}
1357
/// A non-binary `value` column must be rejected with a descriptive error.
#[test]
fn invalid_value_field_type() {
    let schema = Fields::from(vec![
        Field::new("metadata", DataType::BinaryView, true),
        Field::new("value", DataType::Int32, true),
    ]);
    let columns = vec![make_binary_view_array(), make_int32_array()];
    let struct_array = StructArray::new(schema, columns, None);

    let message = VariantArray::try_new(&struct_array)
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "Invalid argument error: VariantArray 'value' field must be Binary, LargeBinary, or BinaryView, got Int32"
    );
}
1375
1376 fn make_binary_view_array() -> ArrayRef {
1377 Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]]))
1378 }
1379
1380 fn make_int32_array() -> ArrayRef {
1381 Arc::new(Int32Array::from(vec![1]))
1382 }
1383
1384 fn make_variant_struct_with_typed_value(typed_value: ArrayRef) -> StructArray {
1385 let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
1386 EMPTY_VARIANT_METADATA_BYTES,
1387 typed_value.len(),
1388 ));
1389 StructArrayBuilder::new()
1390 .with_field("metadata", Arc::new(metadata), false)
1391 .with_field("typed_value", typed_value, true)
1392 .build()
1393 }
1394
/// `ShreddingState::new(None, None)` stores neither column.
#[test]
fn all_null_shredding_state() {
    let state = ShreddingState::new(None, None);
    assert!(matches!(
        state,
        ShreddingState {
            value: None,
            typed_value: None
        }
    ));
}
1406
/// A metadata-only struct whose rows are all null at the struct level yields
/// a variant array with no shredding columns and no valid rows.
#[test]
fn all_null_variant_array_construction() {
    let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
    // Every row is null at the struct level.
    let all_null = NullBuffer::from(vec![false, false, false]);
    let schema = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
    let struct_array = StructArray::new(schema, vec![Arc::new(metadata)], Some(all_null));

    let variant_array = VariantArray::try_new(&struct_array).unwrap();

    let state = variant_array.shredding_state();
    assert!(matches!(
        state,
        ShreddingState {
            value: None,
            typed_value: None
        }
    ));

    assert_eq!(variant_array.len(), 3);
    assert!(!variant_array.is_valid(0));
    assert!(!variant_array.is_valid(1));
    assert!(!variant_array.is_valid(2));

    (0..variant_array.len()).for_each(|i| {
        assert!(
            !variant_array.is_valid(i),
            "Expected value at index {i} to be null"
        );
    });
}
1440
/// A `value` column that exists but is entirely null is still recorded as
/// present, with no `typed_value` column.
#[test]
fn value_field_present_but_all_null_should_be_unshredded() {
    let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);

    // Build a three-row BinaryView column and mark every row as null.
    let all_null = NullBuffer::from(vec![false, false, false]);
    let placeholder = BinaryViewArray::from_iter_values(vec![""; 3]);
    let nulled_data = placeholder
        .to_data()
        .into_builder()
        .nulls(Some(all_null))
        .build()
        .unwrap();
    let value = BinaryViewArray::from(nulled_data);

    let schema = Fields::from(vec![
        Field::new("metadata", DataType::BinaryView, false),
        Field::new("value", DataType::BinaryView, true),
    ]);
    let columns: Vec<ArrayRef> = vec![Arc::new(metadata), Arc::new(value)];
    // No struct-level nulls.
    let struct_array = StructArray::new(schema, columns, None);

    let variant_array = VariantArray::try_new(&struct_array).unwrap();

    let state = variant_array.shredding_state();
    assert!(matches!(
        state,
        ShreddingState {
            value: Some(_),
            typed_value: None
        }
    ));
}
1479
/// `LargeList`, `ListView` and `LargeListView` over each binary element type
/// are already canonical and must come back unchanged.
#[test]
fn canonicalize_and_verify_list_like_data_types() {
    let element_types = [DataType::Binary, DataType::LargeBinary, DataType::BinaryView];
    let containers: [fn(Arc<Field>) -> DataType; 3] = [
        DataType::LargeList,
        DataType::ListView,
        DataType::LargeListView,
    ];

    // Same case order as a flat list: for each element type, every container.
    for element_type in element_types {
        for make_container in containers {
            let item = Arc::new(Field::new("item", element_type.clone(), true));
            let input = make_container(item);
            assert_eq!(
                canonicalize_and_verify_data_type(&input).unwrap().as_ref(),
                &input
            );
        }
    }
}
1511
/// `VariantArray::try_new` accepts every list-like `typed_value` column and
/// leaves its data type untouched.
#[test]
fn variant_array_try_new_supports_list_like_typed_value() {
    let element_field = Arc::new(Field::new("item", DataType::Int64, true));
    let elements: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)]));

    let list = Arc::new(ListArray::new(
        element_field.clone(),
        OffsetBuffer::new(ScalarBuffer::from(vec![0, 2, 3])),
        elements.clone(),
        None,
    )) as ArrayRef;
    let large_list = Arc::new(LargeListArray::new(
        element_field.clone(),
        OffsetBuffer::new(ScalarBuffer::from(vec![0_i64, 2, 3])),
        elements.clone(),
        None,
    )) as ArrayRef;
    let list_view = Arc::new(ListViewArray::new(
        element_field.clone(),
        ScalarBuffer::from(vec![0, 2]),
        ScalarBuffer::from(vec![2, 1]),
        elements.clone(),
        None,
    )) as ArrayRef;
    let large_list_view = Arc::new(LargeListViewArray::new(
        element_field,
        ScalarBuffer::from(vec![0_i64, 2]),
        ScalarBuffer::from(vec![2_i64, 1]),
        elements,
        None,
    )) as ArrayRef;

    for typed_value in [list, large_list, list_view, large_list_view] {
        let input = make_variant_struct_with_typed_value(typed_value.clone());
        let variant_array = VariantArray::try_new(&input).unwrap();
        assert_eq!(
            variant_array.typed_value_field().unwrap().data_type(),
            typed_value.data_type(),
        );
    }
}
1555
/// Forward iteration yields `None` for appended nulls and `Some(variant)`
/// (including `Some(Variant::Null)`) for appended variants, in order.
#[test]
fn test_variant_array_iterable() {
    let mut builder = VariantArrayBuilder::new(6);

    builder.append_null();
    builder.append_variant(Variant::from(1_i8));
    builder.append_variant(Variant::Null);
    builder.append_variant(Variant::from(2_i32));
    builder.append_variant(Variant::from(3_i64));
    builder.append_null();

    let array = builder.build();

    let expected = vec![
        None,
        Some(Variant::Int8(1)),
        Some(Variant::Null),
        Some(Variant::Int32(2)),
        Some(Variant::Int64(3)),
        None,
    ];
    let actual = array.iter().collect::<Vec<_>>();

    assert_eq!(actual, expected);
}
1583
/// Mixing `next` and `next_back` consumes rows from both ends until the
/// cursors meet, after which both directions yield `None`.
#[test]
fn test_variant_array_iter_double_ended() {
    let mut builder = VariantArrayBuilder::new(5);

    builder.append_variant(Variant::from(0_i32));
    builder.append_null();
    builder.append_variant(Variant::from(2_i32));
    builder.append_null();
    builder.append_variant(Variant::from(4_i32));

    let variant_array = builder.build();
    let mut rows = variant_array.iter();

    // Two rows from the front.
    assert_eq!(rows.next(), Some(Some(Variant::from(0_i32))));
    assert_eq!(rows.next(), Some(None));

    // The remaining three rows from the back.
    assert_eq!(rows.next_back(), Some(Some(Variant::from(4_i32))));
    assert_eq!(rows.next_back(), Some(None));
    assert_eq!(rows.next_back(), Some(Some(Variant::from(2_i32))));

    // Exhausted in both directions.
    assert_eq!(rows.next_back(), None);
    assert_eq!(rows.next(), None);
}
1607
/// `iter().rev()` yields the rows, nulls included, in reverse insertion order.
#[test]
fn test_variant_array_iter_reverse() {
    let mut builder = VariantArrayBuilder::new(5);
    for row in [Some("a"), None, Some("aaa"), None, Some("aaaaa")] {
        match row {
            Some(text) => {
                builder.append_variant(Variant::from(text));
            }
            None => {
                builder.append_null();
            }
        }
    }
    let array = builder.build();

    let expected = vec![
        Some(Variant::from("aaaaa")),
        None,
        Some(Variant::from("aaa")),
        None,
        Some(Variant::from("a")),
    ];
    let reversed: Vec<_> = array.iter().rev().collect();

    assert_eq!(reversed, expected);
}
1632
/// An array built from an empty builder yields nothing from either end.
#[test]
fn test_variant_array_iter_empty() {
    let empty = VariantArrayBuilder::new(0).build();
    let mut rows = empty.iter();

    assert_eq!(rows.next(), None);
    assert_eq!(rows.next_back(), None);
}
1640
/// Collecting `Option<Variant>` items maps `None` to a null row and `Some(v)`
/// to a non-null row holding `v` (even when `v` is `Variant::Null`).
#[test]
fn test_from_variant_opts_into_variant_array() {
    let rows = vec![None, Some(Variant::Null), Some(Variant::BooleanFalse), None];
    let array = VariantArray::from_iter(rows);

    assert_eq!(array.len(), 4);

    // The `None` inputs at both ends become null rows.
    assert!(array.is_null(0));
    assert!(array.is_null(3));

    // The `Some` inputs in the middle are valid and read back unchanged.
    for (row, expected) in [(1, Variant::Null), (2, Variant::BooleanFalse)] {
        assert!(!array.is_null(row));
        assert_eq!(array.value(row), expected);
    }
}
1659
/// Collecting bare `Variant` items produces one non-null row per item, each
/// reading back as the variant that was supplied.
#[test]
fn test_from_variants_into_variant_array() {
    let expected = vec![
        Variant::Null,
        Variant::BooleanFalse,
        Variant::ShortString(ShortString::try_new("norm").unwrap()),
    ];

    let array = VariantArray::from_iter(expected.clone());
    assert_eq!(array.len(), 3);

    for (row, want) in expected.iter().enumerate() {
        assert!(!array.is_null(row));
        assert_eq!(&array.value(row), want);
    }
}
1684
/// `VariantArray` equality: a clone is equal, while a reordered array and a
/// shorter slice are not.
#[test]
fn test_variant_equality() {
    let rows = [None, Some(Variant::BooleanFalse), Some(Variant::Null), None];
    let array = VariantArray::from_iter(rows.clone());

    // A clone compares equal to its source.
    assert_eq!(array, array.clone());

    // The same rows in the opposite order compare unequal.
    let reversed = VariantArray::from_iter(rows.iter().rev().cloned());
    assert_ne!(array, reversed);

    // A one-row prefix of the array compares unequal to the full array.
    let prefix = array.slice(0, 1);
    assert_ne!(array, prefix);
}
1707
/// A struct with only `metadata` and a `Binary` `typed_value` column reads
/// back through `VariantArray::value` as the same bytes.
#[test]
fn binary_typed_value_roundtrips() {
    let payload: &[u8] = b"hello";

    let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([
        EMPTY_VARIANT_METADATA_BYTES,
    ]));
    let typed_value: ArrayRef = Arc::new(BinaryArray::from(vec![payload]));

    let input = StructArrayBuilder::new()
        .with_field("metadata", metadata, false)
        .with_field("typed_value", typed_value, true)
        .build();

    let array = VariantArray::try_new(&input).unwrap();
    assert_eq!(array.value(0), Variant::from(payload));
}
1724
/// A struct with only `metadata` and a `LargeBinary` `typed_value` column
/// reads back through `VariantArray::value` as the same bytes.
#[test]
fn large_binary_typed_value_roundtrips() {
    let payload: &[u8] = b"world";

    let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([
        EMPTY_VARIANT_METADATA_BYTES,
    ]));
    let typed_value: ArrayRef = Arc::new(LargeBinaryArray::from(vec![payload]));

    let input = StructArrayBuilder::new()
        .with_field("metadata", metadata, false)
        .with_field("typed_value", typed_value, true)
        .build();

    let array = VariantArray::try_new(&input).unwrap();
    assert_eq!(array.value(0), Variant::from(payload));
}
1741
/// Generates a `#[test]` named `$fn_name` asserting that reading row 0 of a
/// `VariantArray` whose `typed_value` column is `$invalid_typed_value` fails.
///
/// The generated test builds a one-row struct from
/// `EMPTY_VARIANT_METADATA_BYTES` metadata plus the given `typed_value` array,
/// requires `VariantArray::try_new` to accept it, and then checks that
/// `try_value(0)` returns an `ArrowError::CastError` whose rendered message
/// contains `$error_msg`.
macro_rules! invalid_variant_array_test {
    ($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => {
        #[test]
        fn $fn_name() {
            let metadata = BinaryViewArray::from_iter_values([EMPTY_VARIANT_METADATA_BYTES]);
            let invalid_typed_value = $invalid_typed_value;

            let struct_array = StructArrayBuilder::new()
                .with_field("metadata", Arc::new(metadata), false)
                .with_field("typed_value", Arc::new(invalid_typed_value), true)
                .build();

            // Construction is expected to succeed; the bad value must only be
            // reported when the row is actually read.
            let array =
                VariantArray::try_new(&struct_array).expect("should create variant array");

            let error = array
                .try_value(0)
                .expect_err("try_value(0) should fail for an invalid typed_value");
            assert!(
                matches!(error, ArrowError::CastError(_)),
                "expected ArrowError::CastError, got {:?}",
                error
            );

            let expected: &str = $error_msg;
            assert!(
                error.to_string().contains(expected),
                "error `{}` did not contain `{}`",
                error,
                expected
            );
        }
    };
}
1776
1777 invalid_variant_array_test!(
1778 test_variant_array_invalide_time,
1779 Time64MicrosecondArray::from(vec![Some(86401000000)]),
1780 "Cast error: Cast failed at index 0 (array type: Time64(µs)): Invalid microsecond from midnight: 86401000000"
1781 );
1782
1783 invalid_variant_array_test!(
1784 test_variant_array_invalid_decimal32,
1785 Decimal32Array::from(vec![Some(1234567890)]),
1786 "Cast error: Cast failed at index 0 (array type: Decimal32(9, 2)): Invalid argument error: 1234567890 is wider than max precision 9"
1787 );
1788
1789 invalid_variant_array_test!(
1790 test_variant_array_invalid_decimal64,
1791 Decimal64Array::from(vec![Some(1234567890123456789)]),
1792 "Cast error: Cast failed at index 0 (array type: Decimal64(18, 6)): Invalid argument error: 1234567890123456789 is wider than max precision 18"
1793 );
1794
1795 invalid_variant_array_test!(
1796 test_variant_array_invalid_decimal128,
1797 Decimal128Array::from(vec![Some(
1798 i128::from_str("123456789012345678901234567890123456789").unwrap()
1799 ),]),
1800 "Cast error: Cast failed at index 0 (array type: Decimal128(38, 10)): Invalid argument error: 123456789012345678901234567890123456789 is wider than max precision 38"
1801 );
1802}