1use crate::VariantArrayBuilder;
21use crate::type_conversion::{
22 generic_conversion_single_value, generic_conversion_single_value_with_result,
23 primitive_conversion_single_value,
24};
25use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
26use arrow::buffer::NullBuffer;
27use arrow::compute::cast;
28use arrow::datatypes::{
29 Date32Type, Decimal32Type, Decimal64Type, Decimal128Type, Float16Type, Float32Type,
30 Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, Time64MicrosecondType,
31 TimestampMicrosecondType, TimestampNanosecondType,
32};
33use arrow::error::Result;
34use arrow_schema::extension::ExtensionType;
35use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
36use chrono::{DateTime, NaiveTime};
37use parquet_variant::{
38 Uuid, Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16, VariantDecimalType as _,
39};
40
41use std::borrow::Cow;
42use std::sync::Arc;
43
44pub struct VariantType;
49
50impl ExtensionType for VariantType {
51 const NAME: &'static str = "arrow.parquet.variant";
52
53 type Metadata = &'static str;
56
57 fn metadata(&self) -> &Self::Metadata {
58 &""
59 }
60
61 fn serialize_metadata(&self) -> Option<String> {
62 Some(String::new())
63 }
64
65 fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata> {
66 Ok("")
67 }
68
69 fn supports_data_type(&self, data_type: &DataType) -> Result<()> {
70 if matches!(data_type, DataType::Struct(_)) {
71 Ok(())
72 } else {
73 Err(ArrowError::InvalidArgumentError(format!(
74 "VariantType only supports StructArray, got {data_type}"
75 )))
76 }
77 }
78
79 fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self> {
80 Self.supports_data_type(data_type)?;
81 Ok(Self)
82 }
83}
84
/// A struct-backed array of variant values.
///
/// Wraps a [`StructArray`] that contains a `metadata` column and, optionally,
/// `value` and `typed_value` columns.
#[derive(Debug, Clone, PartialEq)]
pub struct VariantArray {
    /// The struct array this variant array wraps.
    inner: StructArray,

    /// The `metadata` column, stored separately for direct access.
    metadata: BinaryViewArray,

    /// The optional `value` / `typed_value` columns.
    shredding_state: ShreddingState,
}
227
228impl VariantArray {
229 pub fn try_new(inner: &dyn Array) -> Result<Self> {
257 let inner = cast_to_binary_view_arrays(inner)?;
260
261 let Some(inner) = inner.as_struct_opt() else {
262 return Err(ArrowError::InvalidArgumentError(
263 "Invalid VariantArray: requires StructArray as input".to_string(),
264 ));
265 };
266
267 let Some(metadata_field) = inner.column_by_name("metadata") else {
271 return Err(ArrowError::InvalidArgumentError(
272 "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
273 ));
274 };
275 let Some(metadata) = metadata_field.as_binary_view_opt() else {
276 return Err(ArrowError::NotYetImplemented(format!(
277 "VariantArray 'metadata' field must be BinaryView, got {}",
278 metadata_field.data_type()
279 )));
280 };
281
282 Ok(Self {
284 inner: inner.clone(),
285 metadata: metadata.clone(),
286 shredding_state: ShreddingState::try_from(inner)?,
287 })
288 }
289
290 pub(crate) fn from_parts(
291 metadata: BinaryViewArray,
292 value: Option<BinaryViewArray>,
293 typed_value: Option<ArrayRef>,
294 nulls: Option<NullBuffer>,
295 ) -> Self {
296 let mut builder =
297 StructArrayBuilder::new().with_field("metadata", Arc::new(metadata.clone()), false);
298 if let Some(value) = value.clone() {
299 builder = builder.with_field("value", Arc::new(value), true);
300 }
301 if let Some(typed_value) = typed_value.clone() {
302 builder = builder.with_field("typed_value", typed_value, true);
303 }
304 if let Some(nulls) = nulls {
305 builder = builder.with_nulls(nulls);
306 }
307
308 Self {
309 inner: builder.build(),
310 metadata,
311 shredding_state: ShreddingState::new(value, typed_value),
312 }
313 }
314
315 pub fn inner(&self) -> &StructArray {
317 &self.inner
318 }
319
320 pub fn into_inner(self) -> StructArray {
322 self.inner
323 }
324
325 pub fn shredding_state(&self) -> &ShreddingState {
327 &self.shredding_state
328 }
329
330 pub fn value(&self, index: usize) -> Variant<'_, '_> {
340 self.try_value(index).unwrap()
341 }
342
343 pub fn try_value(&self, index: usize) -> Result<Variant<'_, '_>> {
371 match (self.typed_value_field(), self.value_field()) {
372 (Some(typed_value), value) if typed_value.is_valid(index) => {
374 typed_value_to_variant(typed_value, value, index)
375 }
376 (_, Some(value)) if value.is_valid(index) => {
378 Ok(Variant::new(self.metadata.value(index), value.value(index)))
379 }
380 _ => Ok(Variant::Null),
383 }
384 }
385
386 pub fn metadata_field(&self) -> &BinaryViewArray {
388 &self.metadata
389 }
390
391 pub fn value_field(&self) -> Option<&BinaryViewArray> {
393 self.shredding_state.value_field()
394 }
395
396 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
398 self.shredding_state.typed_value_field()
399 }
400
401 pub fn field(&self, name: impl Into<String>) -> Field {
404 Field::new(
405 name.into(),
406 self.data_type().clone(),
407 self.inner.is_nullable(),
408 )
409 .with_extension_type(VariantType)
410 }
411
412 pub fn data_type(&self) -> &DataType {
414 self.inner.data_type()
415 }
416
417 pub fn slice(&self, offset: usize, length: usize) -> Self {
418 let inner = self.inner.slice(offset, length);
419 let metadata = self.metadata.slice(offset, length);
420 let shredding_state = self.shredding_state.slice(offset, length);
421 Self {
422 inner,
423 metadata,
424 shredding_state,
425 }
426 }
427
428 pub fn len(&self) -> usize {
429 self.inner.len()
430 }
431
432 pub fn is_empty(&self) -> bool {
433 self.inner.is_empty()
434 }
435
436 pub fn nulls(&self) -> Option<&NullBuffer> {
437 self.inner.nulls()
438 }
439
440 pub fn is_null(&self, index: usize) -> bool {
442 self.nulls().is_some_and(|n| n.is_null(index))
443 }
444
445 pub fn is_valid(&self, index: usize) -> bool {
447 !self.is_null(index)
448 }
449
450 pub fn iter(&self) -> VariantArrayIter<'_> {
452 VariantArrayIter::new(self)
453 }
454}
455
456impl From<VariantArray> for StructArray {
457 fn from(variant_array: VariantArray) -> Self {
458 variant_array.into_inner()
459 }
460}
461
462impl From<VariantArray> for ArrayRef {
463 fn from(variant_array: VariantArray) -> Self {
464 Arc::new(variant_array.into_inner())
465 }
466}
467
468impl<'m, 'v> FromIterator<Option<Variant<'m, 'v>>> for VariantArray {
469 fn from_iter<T: IntoIterator<Item = Option<Variant<'m, 'v>>>>(iter: T) -> Self {
470 let iter = iter.into_iter();
471
472 let mut b = VariantArrayBuilder::new(iter.size_hint().0);
473 b.extend(iter);
474 b.build()
475 }
476}
477
478impl<'m, 'v> FromIterator<Variant<'m, 'v>> for VariantArray {
479 fn from_iter<T: IntoIterator<Item = Variant<'m, 'v>>>(iter: T) -> Self {
480 Self::from_iter(iter.into_iter().map(Some))
481 }
482}
483
/// Double-ended iterator over the rows of a [`VariantArray`].
///
/// Yields `None` for rows the array reports as null and `Some(variant)`
/// otherwise.
#[derive(Debug)]
pub struct VariantArrayIter<'a> {
    /// The array being iterated.
    array: &'a VariantArray,
    /// Index of the next row to yield from the front.
    head_i: usize,
    /// One past the index of the next row to yield from the back.
    tail_i: usize,
}
514
515impl<'a> VariantArrayIter<'a> {
516 pub fn new(array: &'a VariantArray) -> Self {
518 Self {
519 array,
520 head_i: 0,
521 tail_i: array.len(),
522 }
523 }
524
525 fn value_opt(&self, i: usize) -> Option<Variant<'a, 'a>> {
526 self.array.is_valid(i).then(|| self.array.value(i))
527 }
528}
529
530impl<'a> Iterator for VariantArrayIter<'a> {
531 type Item = Option<Variant<'a, 'a>>;
532
533 #[inline]
534 fn next(&mut self) -> Option<Self::Item> {
535 if self.head_i == self.tail_i {
536 return None;
537 }
538
539 let out = self.value_opt(self.head_i);
540
541 self.head_i += 1;
542
543 Some(out)
544 }
545
546 fn size_hint(&self) -> (usize, Option<usize>) {
547 let remainder = self.tail_i - self.head_i;
548
549 (remainder, Some(remainder))
550 }
551}
552
553impl<'a> DoubleEndedIterator for VariantArrayIter<'a> {
554 fn next_back(&mut self) -> Option<Self::Item> {
555 if self.head_i == self.tail_i {
556 return None;
557 }
558
559 self.tail_i -= 1;
560
561 Some(self.value_opt(self.tail_i))
562 }
563}
564
// `size_hint` reports identical lower and upper bounds, so the length is exact.
impl<'a> ExactSizeIterator for VariantArrayIter<'a> {}
566
/// A struct-backed array holding optional `value` and `typed_value` columns,
/// without a `metadata` column of its own.
#[derive(Debug)]
pub struct ShreddedVariantFieldArray {
    /// The struct array this wrapper was built from.
    inner: StructArray,
    /// The optional `value` / `typed_value` columns.
    shredding_state: ShreddingState,
}
607
608#[allow(unused)]
609impl ShreddedVariantFieldArray {
610 pub fn try_new(inner: &dyn Array) -> Result<Self> {
631 let Some(inner_struct) = inner.as_struct_opt() else {
632 return Err(ArrowError::InvalidArgumentError(
633 "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(),
634 ));
635 };
636
637 Ok(Self {
639 inner: inner_struct.clone(),
640 shredding_state: ShreddingState::try_from(inner_struct)?,
641 })
642 }
643
644 pub fn shredding_state(&self) -> &ShreddingState {
646 &self.shredding_state
647 }
648
649 pub fn value_field(&self) -> Option<&BinaryViewArray> {
651 self.shredding_state.value_field()
652 }
653
654 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
656 self.shredding_state.typed_value_field()
657 }
658
659 pub fn inner(&self) -> &StructArray {
661 &self.inner
662 }
663
664 pub(crate) fn from_parts(
665 value: Option<BinaryViewArray>,
666 typed_value: Option<ArrayRef>,
667 nulls: Option<NullBuffer>,
668 ) -> Self {
669 let mut builder = StructArrayBuilder::new();
670 if let Some(value) = value.clone() {
671 builder = builder.with_field("value", Arc::new(value), true);
672 }
673 if let Some(typed_value) = typed_value.clone() {
674 builder = builder.with_field("typed_value", typed_value, true);
675 }
676 if let Some(nulls) = nulls {
677 builder = builder.with_nulls(nulls);
678 }
679
680 Self {
681 inner: builder.build(),
682 shredding_state: ShreddingState::new(value, typed_value),
683 }
684 }
685
686 pub fn into_inner(self) -> StructArray {
688 self.inner
689 }
690
691 pub fn data_type(&self) -> &DataType {
692 self.inner.data_type()
693 }
694
695 pub fn len(&self) -> usize {
696 self.inner.len()
697 }
698
699 pub fn is_empty(&self) -> bool {
700 self.inner.is_empty()
701 }
702
703 pub fn offset(&self) -> usize {
704 self.inner.offset()
705 }
706
707 pub fn nulls(&self) -> Option<&NullBuffer> {
708 None
712 }
713 pub fn is_null(&self, index: usize) -> bool {
715 self.nulls().is_some_and(|n| n.is_null(index))
716 }
717
718 pub fn is_valid(&self, index: usize) -> bool {
720 !self.is_null(index)
721 }
722}
723
724impl From<ShreddedVariantFieldArray> for ArrayRef {
725 fn from(array: ShreddedVariantFieldArray) -> Self {
726 Arc::new(array.into_inner())
727 }
728}
729
730impl From<ShreddedVariantFieldArray> for StructArray {
731 fn from(array: ShreddedVariantFieldArray) -> Self {
732 array.into_inner()
733 }
734}
735
/// Owned pair of the optional `value` and `typed_value` columns of a variant
/// struct array.
#[derive(Debug, Clone, PartialEq)]
pub struct ShreddingState {
    /// The `value` column, if present.
    value: Option<BinaryViewArray>,
    /// The `typed_value` column, if present.
    typed_value: Option<ArrayRef>,
}
774
775impl ShreddingState {
776 pub fn new(value: Option<BinaryViewArray>, typed_value: Option<ArrayRef>) -> Self {
791 Self { value, typed_value }
792 }
793
794 pub fn value_field(&self) -> Option<&BinaryViewArray> {
796 self.value.as_ref()
797 }
798
799 pub fn typed_value_field(&self) -> Option<&ArrayRef> {
801 self.typed_value.as_ref()
802 }
803
804 pub fn borrow(&self) -> BorrowedShreddingState<'_> {
806 BorrowedShreddingState {
807 value: self.value_field(),
808 typed_value: self.typed_value_field(),
809 }
810 }
811
812 pub fn slice(&self, offset: usize, length: usize) -> Self {
814 Self {
815 value: self.value.as_ref().map(|v| v.slice(offset, length)),
816 typed_value: self.typed_value.as_ref().map(|tv| tv.slice(offset, length)),
817 }
818 }
819}
820
/// Borrowed counterpart of [`ShreddingState`]: references to the optional
/// `value` and `typed_value` columns.
#[derive(Clone, Debug)]
pub struct BorrowedShreddingState<'a> {
    /// The `value` column, if present.
    value: Option<&'a BinaryViewArray>,
    /// The `typed_value` column, if present.
    typed_value: Option<&'a ArrayRef>,
}
828
829impl<'a> BorrowedShreddingState<'a> {
830 pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a ArrayRef>) -> Self {
845 Self { value, typed_value }
846 }
847
848 pub fn value_field(&self) -> Option<&'a BinaryViewArray> {
850 self.value
851 }
852
853 pub fn typed_value_field(&self) -> Option<&'a ArrayRef> {
855 self.typed_value
856 }
857}
858
859impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
860 type Error = ArrowError;
861
862 fn try_from(inner_struct: &'a StructArray) -> Result<Self> {
863 let value = if let Some(value_col) = inner_struct.column_by_name("value") {
865 let Some(binary_view) = value_col.as_binary_view_opt() else {
866 return Err(ArrowError::NotYetImplemented(format!(
867 "VariantArray 'value' field must be BinaryView, got {}",
868 value_col.data_type()
869 )));
870 };
871 Some(binary_view)
872 } else {
873 None
874 };
875 let typed_value = inner_struct.column_by_name("typed_value");
876 Ok(BorrowedShreddingState::new(value, typed_value))
877 }
878}
879
880impl TryFrom<&StructArray> for ShreddingState {
881 type Error = ArrowError;
882
883 fn try_from(inner_struct: &StructArray) -> Result<Self> {
884 Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
885 }
886}
887
888impl From<BorrowedShreddingState<'_>> for ShreddingState {
889 fn from(state: BorrowedShreddingState<'_>) -> Self {
890 ShreddingState {
891 value: state.value_field().cloned(),
892 typed_value: state.typed_value_field().cloned(),
893 }
894 }
895}
896
/// Small helper that accumulates fields, their arrays and an optional null
/// buffer, and then assembles a [`StructArray`] from them.
#[derive(Debug, Default, Clone)]
pub(crate) struct StructArrayBuilder {
    /// Field definitions, in insertion order.
    fields: Vec<FieldRef>,
    /// Column arrays, parallel to `fields`.
    arrays: Vec<ArrayRef>,
    /// Null buffer for the struct as a whole, if one was set.
    nulls: Option<NullBuffer>,
}
906
907impl StructArrayBuilder {
908 pub fn new() -> Self {
909 Default::default()
910 }
911
912 pub fn with_field(mut self, field_name: &str, array: ArrayRef, nullable: bool) -> Self {
914 let field = Field::new(field_name, array.data_type().clone(), nullable);
915 self.fields.push(Arc::new(field));
916 self.arrays.push(array);
917 self
918 }
919
920 pub fn with_nulls(mut self, nulls: NullBuffer) -> Self {
922 self.nulls = Some(nulls);
923 self
924 }
925
926 pub fn build(self) -> StructArray {
927 let Self {
928 fields,
929 arrays,
930 nulls,
931 } = self;
932 StructArray::new(Fields::from(fields), arrays, nulls)
933 }
934}
935
936fn typed_value_to_variant<'a>(
938 typed_value: &'a ArrayRef,
939 value: Option<&BinaryViewArray>,
940 index: usize,
941) -> Result<Variant<'a, 'a>> {
942 let data_type = typed_value.data_type();
943 if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) {
944 panic!("Invalid variant, conflicting value and typed_value");
946 }
947 match data_type {
948 DataType::Null => Ok(Variant::Null),
949 DataType::Boolean => {
950 let boolean_array = typed_value.as_boolean();
951 let value = boolean_array.value(index);
952 Ok(Variant::from(value))
953 }
954 DataType::FixedSizeBinary(16) => {
956 let array = typed_value.as_fixed_size_binary();
957 let value = array.value(index);
958 Ok(Uuid::from_slice(value).unwrap().into()) }
960 DataType::BinaryView => {
961 let array = typed_value.as_binary_view();
962 let value = array.value(index);
963 Ok(Variant::from(value))
964 }
965 DataType::Utf8 => {
966 let array = typed_value.as_string::<i32>();
967 let value = array.value(index);
968 Ok(Variant::from(value))
969 }
970 DataType::LargeUtf8 => {
971 let array = typed_value.as_string::<i64>();
972 let value = array.value(index);
973 Ok(Variant::from(value))
974 }
975 DataType::Utf8View => {
976 let array = typed_value.as_string_view();
977 let value = array.value(index);
978 Ok(Variant::from(value))
979 }
980 DataType::Int8 => {
981 primitive_conversion_single_value!(Int8Type, typed_value, index)
982 }
983 DataType::Int16 => {
984 primitive_conversion_single_value!(Int16Type, typed_value, index)
985 }
986 DataType::Int32 => {
987 primitive_conversion_single_value!(Int32Type, typed_value, index)
988 }
989 DataType::Int64 => {
990 primitive_conversion_single_value!(Int64Type, typed_value, index)
991 }
992 DataType::Float16 => {
993 primitive_conversion_single_value!(Float16Type, typed_value, index)
994 }
995 DataType::Float32 => {
996 primitive_conversion_single_value!(Float32Type, typed_value, index)
997 }
998 DataType::Float64 => {
999 primitive_conversion_single_value!(Float64Type, typed_value, index)
1000 }
1001 DataType::Decimal32(_, s) => {
1002 generic_conversion_single_value_with_result!(
1003 Decimal32Type,
1004 as_primitive,
1005 |v| VariantDecimal4::try_new(v, *s as u8),
1006 typed_value,
1007 index
1008 )
1009 }
1010 DataType::Decimal64(_, s) => {
1011 generic_conversion_single_value_with_result!(
1012 Decimal64Type,
1013 as_primitive,
1014 |v| VariantDecimal8::try_new(v, *s as u8),
1015 typed_value,
1016 index
1017 )
1018 }
1019 DataType::Decimal128(_, s) => {
1020 generic_conversion_single_value_with_result!(
1021 Decimal128Type,
1022 as_primitive,
1023 |v| VariantDecimal16::try_new(v, *s as u8),
1024 typed_value,
1025 index
1026 )
1027 }
1028 DataType::Date32 => {
1029 generic_conversion_single_value!(
1030 Date32Type,
1031 as_primitive,
1032 |v| Date32Type::to_naive_date_opt(v).unwrap(),
1033 typed_value,
1034 index
1035 )
1036 }
1037 DataType::Time64(TimeUnit::Microsecond) => {
1038 generic_conversion_single_value_with_result!(
1039 Time64MicrosecondType,
1040 as_primitive,
1041 |v| NaiveTime::from_num_seconds_from_midnight_opt(
1042 (v / 1_000_000) as u32,
1043 (v % 1_000_000) as u32 * 1000
1044 )
1045 .ok_or_else(|| format!("Invalid microsecond from midnight: {}", v)),
1046 typed_value,
1047 index
1048 )
1049 }
1050 DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => {
1051 generic_conversion_single_value!(
1052 TimestampMicrosecondType,
1053 as_primitive,
1054 |v| DateTime::from_timestamp_micros(v).unwrap(),
1055 typed_value,
1056 index
1057 )
1058 }
1059 DataType::Timestamp(TimeUnit::Microsecond, None) => {
1060 generic_conversion_single_value!(
1061 TimestampMicrosecondType,
1062 as_primitive,
1063 |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(),
1064 typed_value,
1065 index
1066 )
1067 }
1068 DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => {
1069 generic_conversion_single_value!(
1070 TimestampNanosecondType,
1071 as_primitive,
1072 DateTime::from_timestamp_nanos,
1073 typed_value,
1074 index
1075 )
1076 }
1077 DataType::Timestamp(TimeUnit::Nanosecond, None) => {
1078 generic_conversion_single_value!(
1079 TimestampNanosecondType,
1080 as_primitive,
1081 |v| DateTime::from_timestamp_nanos(v).naive_utc(),
1082 typed_value,
1083 index
1084 )
1085 }
1086 _ => {
1089 debug_assert!(
1093 false,
1094 "Unsupported typed_value type: {}",
1095 typed_value.data_type()
1096 );
1097 Ok(Variant::Null)
1098 }
1099 }
1100}
1101
1102fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef> {
1113 let new_type = canonicalize_and_verify_data_type(array.data_type())?;
1114 if let Cow::Borrowed(_) = new_type {
1115 if let Some(array) = array.as_struct_opt() {
1116 return Ok(Arc::new(array.clone())); }
1118 }
1119 cast(array, new_type.as_ref())
1120}
1121
/// Checks that `data_type` is acceptable and returns its canonical form.
///
/// Returns `Cow::Borrowed` when the type is accepted unchanged and
/// `Cow::Owned` when the type (or a nested field type) had to be rewritten.
///
/// # Errors
///
/// Returns an `InvalidArgumentError` ("Illegal shredded value type: ...") for
/// every rejected type.
fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_, DataType>> {
    use DataType::*;

    // Early-returns the rejection error for the current `data_type`.
    macro_rules! fail {
        () => {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Illegal shredded value type: {data_type}"
            )))
        };
    }
    // Accepts the current `data_type` unchanged.
    macro_rules! borrow {
        () => {
            Cow::Borrowed(data_type)
        };
    }

    let new_data_type = match data_type {
        // Accepted as-is.
        Null | Boolean => borrow!(),
        Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(),

        // Unsigned integers and half-precision floats are rejected.
        UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(),

        // Decimals: arm order matters. Wider decimals whose precision/scale fit a
        // narrower variant decimal are narrowed first; the remaining guarded arms
        // accept a decimal unchanged; anything left over is rejected.
        Decimal64(p, s) | Decimal128(p, s)
            if VariantDecimal4::is_valid_precision_and_scale(p, s) =>
        {
            Cow::Owned(Decimal32(*p, *s))
        }
        Decimal128(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => {
            Cow::Owned(Decimal64(*p, *s))
        }
        Decimal32(p, s) if VariantDecimal4::is_valid_precision_and_scale(p, s) => borrow!(),
        Decimal64(p, s) if VariantDecimal8::is_valid_precision_and_scale(p, s) => borrow!(),
        Decimal128(p, s) if VariantDecimal16::is_valid_precision_and_scale(p, s) => borrow!(),
        Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(),

        // Timestamps: only microsecond and nanosecond units are accepted.
        Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(),
        Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(),

        // Dates and times: only `Date32` and microsecond `Time64` are accepted.
        Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
        Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),

        // `Binary` and `LargeBinary` are rewritten to `BinaryView`; `BinaryView` and the
        // string types are accepted unchanged.
        Binary | LargeBinary => Cow::Owned(BinaryView),
        BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),

        // Fixed-size binary is accepted only at width 16 (the width the typed-value
        // conversion reads as a UUID).
        FixedSizeBinary(16) => borrow!(),
        FixedSizeBinary(_) | FixedSizeList(..) => fail!(),

        // List-like types: canonicalize the element field and rebuild the list type
        // only if the element changed.
        List(field) => match canonicalize_and_verify_field(field)? {
            Cow::Borrowed(_) => borrow!(),
            Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)),
        },
        LargeList(field) => match canonicalize_and_verify_field(field)? {
            Cow::Borrowed(_) => borrow!(),
            Cow::Owned(new_field) => Cow::Owned(DataType::LargeList(new_field)),
        },
        ListView(field) => match canonicalize_and_verify_field(field)? {
            Cow::Borrowed(_) => borrow!(),
            Cow::Owned(new_field) => Cow::Owned(DataType::ListView(new_field)),
        },
        LargeListView(field) => match canonicalize_and_verify_field(field)? {
            Cow::Borrowed(_) => borrow!(),
            Cow::Owned(new_field) => Cow::Owned(DataType::LargeListView(new_field)),
        },
        Struct(fields) => {
            // Collect only the fields that changed, keyed by position.
            let mut new_fields = std::collections::HashMap::new();
            for (i, field) in fields.iter().enumerate() {
                if let Cow::Owned(new_field) = canonicalize_and_verify_field(field)? {
                    new_fields.insert(i, new_field);
                }
            }

            if new_fields.is_empty() {
                borrow!()
            } else {
                // Rebuild the field list, substituting the rewritten fields in place.
                let new_fields = fields
                    .iter()
                    .enumerate()
                    .map(|(i, field)| new_fields.remove(&i).unwrap_or_else(|| field.clone()));
                Cow::Owned(DataType::Struct(new_fields.collect()))
            }
        }
        Map(..) | Union(..) => fail!(),

        Dictionary(..) | RunEndEncoded(..) => fail!(),
    };
    Ok(new_data_type)
}
1229
1230fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, Arc<Field>>> {
1231 let Cow::Owned(new_data_type) = canonicalize_and_verify_data_type(field.data_type())? else {
1232 return Ok(Cow::Borrowed(field));
1233 };
1234 let new_field = field.as_ref().clone().with_data_type(new_data_type);
1235 Ok(Cow::Owned(Arc::new(new_field)))
1236}
1237
1238#[cfg(test)]
1239mod test {
1240 use crate::VariantArrayBuilder;
1241 use std::str::FromStr;
1242
1243 use super::*;
1244 use arrow::array::{
1245 BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array, Int32Array, Int64Array,
1246 LargeListArray, LargeListViewArray, ListArray, ListViewArray, Time64MicrosecondArray,
1247 };
1248 use arrow::buffer::{OffsetBuffer, ScalarBuffer};
1249 use arrow_schema::{Field, Fields};
1250 use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, ShortString};
1251
/// A non-struct input must be rejected with the "requires StructArray" error.
#[test]
fn invalid_not_a_struct_array() {
    let not_a_struct = make_binary_view_array();
    let message = VariantArray::try_new(&not_a_struct)
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "Invalid argument error: Invalid VariantArray: requires StructArray as input"
    );
}
1262
/// A struct without a `metadata` column must be rejected.
#[test]
fn invalid_missing_metadata() {
    let schema = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]);
    let columns = vec![make_binary_view_array()];
    let array = StructArray::new(schema, columns, None);

    let message = VariantArray::try_new(&array).unwrap_err().to_string();
    assert_eq!(
        message,
        "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field"
    );
}
1274
/// With only a `metadata` column, construction succeeds, the shredding state
/// has neither column, and every valid row reads back as `Variant::Null`.
#[test]
fn all_null_missing_value_and_typed_value() {
    let schema = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
    let array = StructArray::new(schema, vec![make_binary_view_array()], None);

    let variant_array = VariantArray::try_new(&array).unwrap();

    assert!(matches!(
        variant_array.shredding_state(),
        ShreddingState {
            value: None,
            typed_value: None
        }
    ));

    (0..variant_array.len())
        .filter(|&row| variant_array.is_valid(row))
        .for_each(|row| {
            assert_eq!(variant_array.value(row), parquet_variant::Variant::Null);
        });
}
1301
/// A `metadata` column that is not `BinaryView` must be rejected.
#[test]
fn invalid_metadata_field_type() {
    let schema = Fields::from(vec![
        Field::new("metadata", DataType::Int32, true),
        Field::new("value", DataType::BinaryView, true),
    ]);
    let columns = vec![make_int32_array(), make_binary_view_array()];
    let array = StructArray::new(schema, columns, None);

    let message = VariantArray::try_new(&array).unwrap_err().to_string();
    assert_eq!(
        message,
        "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Int32"
    );
}
1319
/// A `value` column that is not `BinaryView` must be rejected.
#[test]
fn invalid_value_field_type() {
    let schema = Fields::from(vec![
        Field::new("metadata", DataType::BinaryView, true),
        Field::new("value", DataType::Int32, true),
    ]);
    let columns = vec![make_binary_view_array(), make_int32_array()];
    let array = StructArray::new(schema, columns, None);

    let message = VariantArray::try_new(&array).unwrap_err().to_string();
    assert_eq!(
        message,
        "Not yet implemented: VariantArray 'value' field must be BinaryView, got Int32"
    );
}
1337
1338 fn make_binary_view_array() -> ArrayRef {
1339 Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]]))
1340 }
1341
1342 fn make_int32_array() -> ArrayRef {
1343 Arc::new(Int32Array::from(vec![1]))
1344 }
1345
1346 fn make_variant_struct_with_typed_value(typed_value: ArrayRef) -> StructArray {
1347 let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
1348 EMPTY_VARIANT_METADATA_BYTES,
1349 typed_value.len(),
1350 ));
1351 StructArrayBuilder::new()
1352 .with_field("metadata", Arc::new(metadata), false)
1353 .with_field("typed_value", typed_value, true)
1354 .build()
1355 }
1356
/// A state created from two `None`s holds neither column.
#[test]
fn all_null_shredding_state() {
    let state = ShreddingState::new(None, None);
    assert!(matches!(
        state,
        ShreddingState {
            value: None,
            typed_value: None
        }
    ));
}
1368
/// A metadata-only struct whose rows are all null yields a variant array with
/// no shredded columns and every row reported as null.
#[test]
fn all_null_variant_array_construction() {
    let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
    let all_null = NullBuffer::from(vec![false, false, false]);
    let schema = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
    let struct_array = StructArray::new(schema, vec![Arc::new(metadata)], Some(all_null));

    let variant_array = VariantArray::try_new(&struct_array).unwrap();

    assert!(matches!(
        variant_array.shredding_state(),
        ShreddingState {
            value: None,
            typed_value: None
        }
    ));

    assert_eq!(variant_array.len(), 3);
    assert!(!variant_array.is_valid(0));
    assert!(!variant_array.is_valid(1));
    assert!(!variant_array.is_valid(2));

    for i in 0..variant_array.len() {
        let valid = variant_array.is_valid(i);
        assert!(!valid, "Expected value at index {i} to be null");
    }
}
1402
/// A `value` column that exists but is null in every row is still recorded as
/// present in the shredding state.
#[test]
fn value_field_present_but_all_null_should_be_unshredded() {
    let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);

    // Build a `value` column and then mark every row as null.
    let all_null = NullBuffer::from(vec![false, false, false]);
    let value_data = BinaryViewArray::from_iter_values(vec![""; 3])
        .to_data()
        .into_builder()
        .nulls(Some(all_null))
        .build()
        .unwrap();
    let value = BinaryViewArray::from(value_data);

    let schema = Fields::from(vec![
        Field::new("metadata", DataType::BinaryView, false),
        Field::new("value", DataType::BinaryView, true),
    ]);
    let columns: Vec<ArrayRef> = vec![Arc::new(metadata), Arc::new(value)];
    let struct_array = StructArray::new(schema, columns, None);

    let variant_array = VariantArray::try_new(&struct_array).unwrap();

    assert!(matches!(
        variant_array.shredding_state(),
        ShreddingState {
            value: Some(_),
            typed_value: None
        }
    ));
}
1441
/// `LargeList`, `ListView` and `LargeListView` with a `Binary` element are
/// canonicalized to the same list type with a `BinaryView` element.
#[test]
fn canonicalize_and_verify_list_like_data_types() {
    let binary_item = || Arc::new(Field::new("item", DataType::Binary, true));
    let binary_view_item = || Arc::new(Field::new("item", DataType::BinaryView, true));

    let cases = vec![
        (
            DataType::LargeList(binary_item()),
            DataType::LargeList(binary_view_item()),
        ),
        (
            DataType::ListView(binary_item()),
            DataType::ListView(binary_view_item()),
        ),
        (
            DataType::LargeListView(binary_item()),
            DataType::LargeListView(binary_view_item()),
        ),
    ];

    for (input, expected) in cases {
        let canonical = canonicalize_and_verify_data_type(&input).unwrap();
        assert_eq!(canonical.as_ref(), &expected);
    }
}
1472
/// `try_new` accepts all four list-like `typed_value` columns and leaves their
/// data type untouched.
#[test]
fn variant_array_try_new_supports_list_like_typed_value() {
    let item_field = Arc::new(Field::new("item", DataType::Int64, true));
    let values: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)]));

    let list = ListArray::new(
        item_field.clone(),
        OffsetBuffer::new(ScalarBuffer::from(vec![0, 2, 3])),
        values.clone(),
        None,
    );
    let large_list = LargeListArray::new(
        item_field.clone(),
        OffsetBuffer::new(ScalarBuffer::from(vec![0_i64, 2, 3])),
        values.clone(),
        None,
    );
    let list_view = ListViewArray::new(
        item_field.clone(),
        ScalarBuffer::from(vec![0, 2]),
        ScalarBuffer::from(vec![2, 1]),
        values.clone(),
        None,
    );
    let large_list_view = LargeListViewArray::new(
        item_field,
        ScalarBuffer::from(vec![0_i64, 2]),
        ScalarBuffer::from(vec![2_i64, 1]),
        values,
        None,
    );

    let typed_values = vec![
        Arc::new(list) as ArrayRef,
        Arc::new(large_list) as ArrayRef,
        Arc::new(list_view) as ArrayRef,
        Arc::new(large_list_view) as ArrayRef,
    ];

    for typed_value in typed_values {
        let input = make_variant_struct_with_typed_value(typed_value.clone());
        let variant_array = VariantArray::try_new(&input).unwrap();
        let stored = variant_array.typed_value_field().unwrap();
        assert_eq!(stored.data_type(), typed_value.data_type(),);
    }
}
1516
/// Forward iteration yields `None` for appended nulls and `Some` for variants,
/// including an explicit `Variant::Null`.
#[test]
fn test_variant_array_iterable() {
    let mut builder = VariantArrayBuilder::new(6);

    builder.append_null();
    builder.append_variant(Variant::from(1_i8));
    builder.append_variant(Variant::Null);
    builder.append_variant(Variant::from(2_i32));
    builder.append_variant(Variant::from(3_i64));
    builder.append_null();

    let array = builder.build();
    let collected: Vec<_> = array.iter().collect();

    let expected = vec![
        None,
        Some(Variant::Int8(1)),
        Some(Variant::Null),
        Some(Variant::Int32(2)),
        Some(Variant::Int64(3)),
        None,
    ];
    assert_eq!(collected, expected);
}
1544
/// Mixing `next` and `next_back` consumes each row once and then reports
/// exhaustion from both ends.
#[test]
fn test_variant_array_iter_double_ended() {
    let mut builder = VariantArrayBuilder::new(5);

    builder.append_variant(Variant::from(0_i32));
    builder.append_null();
    builder.append_variant(Variant::from(2_i32));
    builder.append_null();
    builder.append_variant(Variant::from(4_i32));

    let array = builder.build();
    let mut rows = array.iter();

    // Two rows from the front.
    assert_eq!(rows.next(), Some(Some(Variant::from(0_i32))));
    assert_eq!(rows.next(), Some(None));

    // The remaining three from the back.
    assert_eq!(rows.next_back(), Some(Some(Variant::from(4_i32))));
    assert_eq!(rows.next_back(), Some(None));
    assert_eq!(rows.next_back(), Some(Some(Variant::from(2_i32))));

    // Both ends are now exhausted.
    assert_eq!(rows.next_back(), None);
    assert_eq!(rows.next(), None);
}
1568
/// `rev()` yields the rows in reverse order.
#[test]
fn test_variant_array_iter_reverse() {
    let mut builder = VariantArrayBuilder::new(5);

    builder.append_variant(Variant::from("a"));
    builder.append_null();
    builder.append_variant(Variant::from("aaa"));
    builder.append_null();
    builder.append_variant(Variant::from("aaaaa"));

    let array = builder.build();
    let reversed: Vec<_> = array.iter().rev().collect();

    let expected = vec![
        Some(Variant::from("aaaaa")),
        None,
        Some(Variant::from("aaa")),
        None,
        Some(Variant::from("a")),
    ];
    assert_eq!(reversed, expected);
}
1593
/// Iterating an empty array yields nothing from either end.
#[test]
fn test_variant_array_iter_empty() {
    let array = VariantArrayBuilder::new(0).build();
    let mut rows = array.iter();
    assert!(rows.next().is_none());
    assert!(rows.next_back().is_none());
}
1601
/// Collecting `Option<Variant>` items maps `None` to null rows and `Some` to
/// valid rows holding the given variant.
#[test]
fn test_from_variant_opts_into_variant_array() {
    let items = vec![None, Some(Variant::Null), Some(Variant::BooleanFalse), None];

    let variant_array = VariantArray::from_iter(items);

    assert_eq!(variant_array.len(), 4);

    // Row 0: null.
    assert!(variant_array.is_null(0));

    // Row 1: valid, holding `Variant::Null`.
    assert!(!variant_array.is_null(1));
    assert_eq!(variant_array.value(1), Variant::Null);

    // Row 2: valid, holding `false`.
    assert!(!variant_array.is_null(2));
    assert_eq!(variant_array.value(2), Variant::BooleanFalse);

    // Row 3: null.
    assert!(variant_array.is_null(3));
}
1620
/// Collecting bare `Variant` items produces only valid rows.
#[test]
fn test_from_variants_into_variant_array() {
    let items = vec![
        Variant::Null,
        Variant::BooleanFalse,
        Variant::ShortString(ShortString::try_new("norm").unwrap()),
    ];

    let variant_array = VariantArray::from_iter(items);

    assert_eq!(variant_array.len(), 3);

    assert!(!variant_array.is_null(0));
    assert_eq!(variant_array.value(0), Variant::Null);

    assert!(!variant_array.is_null(1));
    assert_eq!(variant_array.value(1), Variant::BooleanFalse);

    assert!(!variant_array.is_null(2));
    let expected_string = Variant::ShortString(ShortString::try_new("norm").unwrap());
    assert_eq!(variant_array.value(2), expected_string);
}
1645
/// A clone compares equal; a reversed copy and a shorter slice do not.
#[test]
fn test_variant_equality() {
    let items = [None, Some(Variant::BooleanFalse), Some(Variant::Null), None];
    let array = VariantArray::from_iter(items.clone());

    // Same contents.
    let cloned = array.clone();
    assert_eq!(array, cloned);

    // Same rows in reverse order.
    let reversed = VariantArray::from_iter(items.iter().cloned().rev());
    assert_ne!(array, reversed);

    // Only the first row.
    let sliced = array.slice(0, 1);
    assert_ne!(array, sliced);
}
1668
/// Generates a test named `$fn_name` that wraps `$invalid_typed_value` as the
/// `typed_value` column of a one-row variant struct and checks that reading
/// row 0 fails with an `ArrowError::CastError` whose message contains
/// `$error_msg`.
macro_rules! invalid_variant_array_test {
    ($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => {
        #[test]
        fn $fn_name() {
            let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
                EMPTY_VARIANT_METADATA_BYTES,
                1,
            ));
            let invalid_typed_value = $invalid_typed_value;

            let struct_array = StructArrayBuilder::new()
                .with_field("metadata", Arc::new(metadata), false)
                .with_field("typed_value", Arc::new(invalid_typed_value), true)
                .build();

            // Construction succeeds; the failure only surfaces when the row is read.
            let array =
                VariantArray::try_new(&struct_array).expect("should create variant array");

            let result = array.try_value(0);
            assert!(result.is_err());
            let error = result.unwrap_err();
            assert!(matches!(error, ArrowError::CastError(_)));

            let expected: &str = $error_msg;
            assert!(
                error.to_string().contains(expected),
                "error `{}` did not contain `{}`",
                error,
                expected
            )
        }
    };
}
1703
// A microsecond count past the end of the day is rejected.
// NOTE(review): the test name below contains a typo ("invalide"); consider renaming.
invalid_variant_array_test!(
    test_variant_array_invalide_time,
    Time64MicrosecondArray::from(vec![Some(86401000000)]),
    "Cast error: Cast failed at index 0 (array type: Time64(µs)): Invalid microsecond from midnight: 86401000000"
);

// A 10-digit value is wider than the precision reported for the Decimal32 array.
invalid_variant_array_test!(
    test_variant_array_invalid_decimal32,
    Decimal32Array::from(vec![Some(1234567890)]),
    "Cast error: Cast failed at index 0 (array type: Decimal32(9, 2)): Invalid argument error: 1234567890 is wider than max precision 9"
);

// A 19-digit value is wider than the precision reported for the Decimal64 array.
invalid_variant_array_test!(
    test_variant_array_invalid_decimal64,
    Decimal64Array::from(vec![Some(1234567890123456789)]),
    "Cast error: Cast failed at index 0 (array type: Decimal64(18, 6)): Invalid argument error: 1234567890123456789 is wider than max precision 18"
);

// A 39-digit value is wider than the precision reported for the Decimal128 array.
invalid_variant_array_test!(
    test_variant_array_invalid_decimal128,
    Decimal128Array::from(vec![Some(
        i128::from_str("123456789012345678901234567890123456789").unwrap()
    ),]),
    "Cast error: Cast failed at index 0 (array type: Decimal128(38, 10)): Invalid argument error: 123456789012345678901234567890123456789 is wider than max precision 38"
);
1729}