1use std::{collections::HashMap, fmt, sync::Arc};
21
22use crate::file::metadata::HeapSize;
23use crate::format::SchemaElement;
24
25use crate::basic::{
26 ColumnOrder, ConvertedType, LogicalType, Repetition, SortOrder, TimeUnit, Type as PhysicalType,
27};
28use crate::errors::{ParquetError, Result};
29
/// Shared, reference-counted pointer to a schema [`Type`] node.
pub type TypePtr = Arc<Type>;
/// Shared, reference-counted pointer to a [`SchemaDescriptor`].
pub type SchemaDescPtr = Arc<SchemaDescriptor>;
/// Shared, reference-counted pointer to a [`ColumnDescriptor`].
pub type ColumnDescPtr = Arc<ColumnDescriptor>;
39
/// A node of a Parquet schema tree.
///
/// A node is either a primitive leaf or a group that holds child nodes. Both
/// variants carry a [`BasicTypeInfo`] with the node's name, repetition,
/// annotations and id.
#[derive(Clone, Debug, PartialEq)]
pub enum Type {
    /// A primitive (leaf) node.
    PrimitiveType {
        /// Name, repetition, converted/logical type and id of the node.
        basic_info: BasicTypeInfo,
        /// Physical type of the node.
        physical_type: PhysicalType,
        /// Type length; the builder in this file defaults it to `-1`.
        type_length: i32,
        /// Decimal scale; the builder in this file defaults it to `-1`.
        scale: i32,
        /// Decimal precision; the builder in this file defaults it to `-1`.
        precision: i32,
    },
    /// A group node that owns an ordered list of child nodes.
    GroupType {
        /// Name, optional repetition, converted/logical type and id of the node.
        basic_info: BasicTypeInfo,
        /// Child nodes, in declaration order.
        fields: Vec<TypePtr>,
    },
}
69
70impl HeapSize for Type {
71 fn heap_size(&self) -> usize {
72 match self {
73 Type::PrimitiveType { basic_info, .. } => basic_info.heap_size(),
74 Type::GroupType { basic_info, fields } => basic_info.heap_size() + fields.heap_size(),
75 }
76 }
77}
78
79impl Type {
80 pub fn primitive_type_builder(
82 name: &str,
83 physical_type: PhysicalType,
84 ) -> PrimitiveTypeBuilder<'_> {
85 PrimitiveTypeBuilder::new(name, physical_type)
86 }
87
88 pub fn group_type_builder(name: &str) -> GroupTypeBuilder<'_> {
90 GroupTypeBuilder::new(name)
91 }
92
93 pub fn get_basic_info(&self) -> &BasicTypeInfo {
95 match *self {
96 Type::PrimitiveType { ref basic_info, .. } => basic_info,
97 Type::GroupType { ref basic_info, .. } => basic_info,
98 }
99 }
100
101 pub fn name(&self) -> &str {
103 self.get_basic_info().name()
104 }
105
106 pub fn get_fields(&self) -> &[TypePtr] {
110 match *self {
111 Type::GroupType { ref fields, .. } => &fields[..],
112 _ => panic!("Cannot call get_fields() on a non-group type"),
113 }
114 }
115
116 pub fn get_physical_type(&self) -> PhysicalType {
119 match *self {
120 Type::PrimitiveType {
121 basic_info: _,
122 physical_type,
123 ..
124 } => physical_type,
125 _ => panic!("Cannot call get_physical_type() on a non-primitive type"),
126 }
127 }
128
129 pub fn get_precision(&self) -> i32 {
132 match *self {
133 Type::PrimitiveType { precision, .. } => precision,
134 _ => panic!("Cannot call get_precision() on non-primitive type"),
135 }
136 }
137
138 pub fn get_scale(&self) -> i32 {
141 match *self {
142 Type::PrimitiveType { scale, .. } => scale,
143 _ => panic!("Cannot call get_scale() on non-primitive type"),
144 }
145 }
146
147 pub fn check_contains(&self, sub_type: &Type) -> bool {
150 let basic_match = self.get_basic_info().name() == sub_type.get_basic_info().name()
152 && (self.is_schema() && sub_type.is_schema()
153 || !self.is_schema()
154 && !sub_type.is_schema()
155 && self.get_basic_info().repetition()
156 == sub_type.get_basic_info().repetition());
157
158 match *self {
159 Type::PrimitiveType { .. } if basic_match && sub_type.is_primitive() => {
160 self.get_physical_type() == sub_type.get_physical_type()
161 }
162 Type::GroupType { .. } if basic_match && sub_type.is_group() => {
163 let mut field_map = HashMap::new();
165 for field in self.get_fields() {
166 field_map.insert(field.name(), field);
167 }
168
169 for field in sub_type.get_fields() {
170 if !field_map
171 .get(field.name())
172 .map(|tpe| tpe.check_contains(field))
173 .unwrap_or(false)
174 {
175 return false;
176 }
177 }
178 true
179 }
180 _ => false,
181 }
182 }
183
184 pub fn is_primitive(&self) -> bool {
186 matches!(*self, Type::PrimitiveType { .. })
187 }
188
189 pub fn is_group(&self) -> bool {
191 matches!(*self, Type::GroupType { .. })
192 }
193
194 pub fn is_schema(&self) -> bool {
196 match *self {
197 Type::GroupType { ref basic_info, .. } => !basic_info.has_repetition(),
198 _ => false,
199 }
200 }
201
202 pub fn is_optional(&self) -> bool {
205 self.get_basic_info().has_repetition()
206 && self.get_basic_info().repetition() != Repetition::REQUIRED
207 }
208
209 pub(crate) fn is_list(&self) -> bool {
211 if self.is_group() {
212 let basic_info = self.get_basic_info();
213 if let Some(logical_type) = basic_info.logical_type() {
214 return logical_type == LogicalType::List;
215 }
216 return basic_info.converted_type() == ConvertedType::LIST;
217 }
218 false
219 }
220
221 pub(crate) fn has_single_repeated_child(&self) -> bool {
223 if self.is_group() {
224 let children = self.get_fields();
225 return children.len() == 1
226 && children[0].get_basic_info().has_repetition()
227 && children[0].get_basic_info().repetition() == Repetition::REPEATED;
228 }
229 false
230 }
231}
232
/// Builder for [`Type::PrimitiveType`] nodes.
///
/// The builder borrows the field name for `'a`; the name is copied into an
/// owned `String` only when `build` is called.
pub struct PrimitiveTypeBuilder<'a> {
    /// Field name.
    name: &'a str,
    /// Repetition of the field.
    repetition: Repetition,
    /// Physical type of the field.
    physical_type: PhysicalType,
    /// Converted type annotation, `ConvertedType::NONE` when not set.
    converted_type: ConvertedType,
    /// Optional logical type annotation.
    logical_type: Option<LogicalType>,
    /// Type length; `-1` until set.
    length: i32,
    /// Decimal precision; `-1` until set.
    precision: i32,
    /// Decimal scale; `-1` until set.
    scale: i32,
    /// Optional field id.
    id: Option<i32>,
}
247
248impl<'a> PrimitiveTypeBuilder<'a> {
249 pub fn new(name: &'a str, physical_type: PhysicalType) -> Self {
251 Self {
252 name,
253 repetition: Repetition::OPTIONAL,
254 physical_type,
255 converted_type: ConvertedType::NONE,
256 logical_type: None,
257 length: -1,
258 precision: -1,
259 scale: -1,
260 id: None,
261 }
262 }
263
264 pub fn with_repetition(self, repetition: Repetition) -> Self {
266 Self { repetition, ..self }
267 }
268
269 pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
271 Self {
272 converted_type,
273 ..self
274 }
275 }
276
277 pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
281 Self {
282 logical_type,
283 ..self
284 }
285 }
286
287 pub fn with_length(self, length: i32) -> Self {
292 Self { length, ..self }
293 }
294
295 pub fn with_precision(self, precision: i32) -> Self {
298 Self { precision, ..self }
299 }
300
301 pub fn with_scale(self, scale: i32) -> Self {
304 Self { scale, ..self }
305 }
306
307 pub fn with_id(self, id: Option<i32>) -> Self {
309 Self { id, ..self }
310 }
311
312 pub fn build(self) -> Result<Type> {
315 let mut basic_info = BasicTypeInfo {
316 name: String::from(self.name),
317 repetition: Some(self.repetition),
318 converted_type: self.converted_type,
319 logical_type: self.logical_type.clone(),
320 id: self.id,
321 };
322
323 if self.physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY && self.length < 0 {
325 return Err(general_err!(
326 "Invalid FIXED_LEN_BYTE_ARRAY length: {} for field '{}'",
327 self.length,
328 self.name
329 ));
330 }
331
332 if let Some(logical_type) = &self.logical_type {
333 if self.converted_type != ConvertedType::NONE {
336 if ConvertedType::from(self.logical_type.clone()) != self.converted_type {
337 return Err(general_err!(
338 "Logical type {:?} is incompatible with converted type {} for field '{}'",
339 logical_type,
340 self.converted_type,
341 self.name
342 ));
343 }
344 } else {
345 basic_info.converted_type = self.logical_type.clone().into();
347 }
348 match (logical_type, self.physical_type) {
350 (LogicalType::Map, _) | (LogicalType::List, _) => {
351 return Err(general_err!(
352 "{:?} cannot be applied to a primitive type for field '{}'",
353 logical_type,
354 self.name
355 ));
356 }
357 (LogicalType::Enum, PhysicalType::BYTE_ARRAY) => {}
358 (LogicalType::Decimal { scale, precision }, _) => {
359 if *scale != self.scale {
361 return Err(general_err!(
362 "DECIMAL logical type scale {} must match self.scale {} for field '{}'",
363 scale,
364 self.scale,
365 self.name
366 ));
367 }
368 if *precision != self.precision {
369 return Err(general_err!(
370 "DECIMAL logical type precision {} must match self.precision {} for field '{}'",
371 precision,
372 self.precision,
373 self.name
374 ));
375 }
376 self.check_decimal_precision_scale()?;
377 }
378 (LogicalType::Date, PhysicalType::INT32) => {}
379 (
380 LogicalType::Time {
381 unit: TimeUnit::MILLIS(_),
382 ..
383 },
384 PhysicalType::INT32,
385 ) => {}
386 (LogicalType::Time { unit, .. }, PhysicalType::INT64) => {
387 if *unit == TimeUnit::MILLIS(Default::default()) {
388 return Err(general_err!(
389 "Cannot use millisecond unit on INT64 type for field '{}'",
390 self.name
391 ));
392 }
393 }
394 (LogicalType::Timestamp { .. }, PhysicalType::INT64) => {}
395 (LogicalType::Integer { bit_width, .. }, PhysicalType::INT32)
396 if *bit_width <= 32 => {}
397 (LogicalType::Integer { bit_width, .. }, PhysicalType::INT64)
398 if *bit_width == 64 => {}
399 (LogicalType::Unknown, PhysicalType::INT32) => {}
401 (LogicalType::String, PhysicalType::BYTE_ARRAY) => {}
402 (LogicalType::Json, PhysicalType::BYTE_ARRAY) => {}
403 (LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {}
404 (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) if self.length == 16 => {}
405 (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
406 return Err(general_err!(
407 "UUID cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(16) field",
408 self.name
409 ))
410 }
411 (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY)
412 if self.length == 2 => {}
413 (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
414 return Err(general_err!(
415 "FLOAT16 cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(2) field",
416 self.name
417 ))
418 }
419 (a, b) => {
420 return Err(general_err!(
421 "Cannot annotate {:?} from {} for field '{}'",
422 a,
423 b,
424 self.name
425 ))
426 }
427 }
428 }
429
430 match self.converted_type {
431 ConvertedType::NONE => {}
432 ConvertedType::UTF8 | ConvertedType::BSON | ConvertedType::JSON => {
433 if self.physical_type != PhysicalType::BYTE_ARRAY {
434 return Err(general_err!(
435 "{} cannot annotate field '{}' because it is not a BYTE_ARRAY field",
436 self.converted_type,
437 self.name
438 ));
439 }
440 }
441 ConvertedType::DECIMAL => {
442 self.check_decimal_precision_scale()?;
443 }
444 ConvertedType::DATE
445 | ConvertedType::TIME_MILLIS
446 | ConvertedType::UINT_8
447 | ConvertedType::UINT_16
448 | ConvertedType::UINT_32
449 | ConvertedType::INT_8
450 | ConvertedType::INT_16
451 | ConvertedType::INT_32 => {
452 if self.physical_type != PhysicalType::INT32 {
453 return Err(general_err!(
454 "{} cannot annotate field '{}' because it is not a INT32 field",
455 self.converted_type,
456 self.name
457 ));
458 }
459 }
460 ConvertedType::TIME_MICROS
461 | ConvertedType::TIMESTAMP_MILLIS
462 | ConvertedType::TIMESTAMP_MICROS
463 | ConvertedType::UINT_64
464 | ConvertedType::INT_64 => {
465 if self.physical_type != PhysicalType::INT64 {
466 return Err(general_err!(
467 "{} cannot annotate field '{}' because it is not a INT64 field",
468 self.converted_type,
469 self.name
470 ));
471 }
472 }
473 ConvertedType::INTERVAL => {
474 if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 {
475 return Err(general_err!(
476 "INTERVAL cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(12) field",
477 self.name
478 ));
479 }
480 }
481 ConvertedType::ENUM => {
482 if self.physical_type != PhysicalType::BYTE_ARRAY {
483 return Err(general_err!(
484 "ENUM cannot annotate field '{}' because it is not a BYTE_ARRAY field",
485 self.name
486 ));
487 }
488 }
489 _ => {
490 return Err(general_err!(
491 "{} cannot be applied to primitive field '{}'",
492 self.converted_type,
493 self.name
494 ));
495 }
496 }
497
498 Ok(Type::PrimitiveType {
499 basic_info,
500 physical_type: self.physical_type,
501 type_length: self.length,
502 scale: self.scale,
503 precision: self.precision,
504 })
505 }
506
507 #[inline]
508 fn check_decimal_precision_scale(&self) -> Result<()> {
509 match self.physical_type {
510 PhysicalType::INT32
511 | PhysicalType::INT64
512 | PhysicalType::BYTE_ARRAY
513 | PhysicalType::FIXED_LEN_BYTE_ARRAY => (),
514 _ => {
515 return Err(general_err!(
516 "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
517 ));
518 }
519 }
520
521 if self.precision < 1 {
523 return Err(general_err!(
524 "Invalid DECIMAL precision: {}",
525 self.precision
526 ));
527 }
528
529 if self.scale < 0 {
531 return Err(general_err!("Invalid DECIMAL scale: {}", self.scale));
532 }
533
534 if self.scale > self.precision {
535 return Err(general_err!(
536 "Invalid DECIMAL: scale ({}) cannot be greater than precision \
537 ({})",
538 self.scale,
539 self.precision
540 ));
541 }
542
543 match self.physical_type {
545 PhysicalType::INT32 => {
546 if self.precision > 9 {
547 return Err(general_err!(
548 "Cannot represent INT32 as DECIMAL with precision {}",
549 self.precision
550 ));
551 }
552 }
553 PhysicalType::INT64 => {
554 if self.precision > 18 {
555 return Err(general_err!(
556 "Cannot represent INT64 as DECIMAL with precision {}",
557 self.precision
558 ));
559 }
560 }
561 PhysicalType::FIXED_LEN_BYTE_ARRAY => {
562 let length = self
563 .length
564 .checked_mul(8)
565 .ok_or(general_err!("Invalid length {} for Decimal", self.length))?;
566 let max_precision = (2f64.powi(length - 1) - 1f64).log10().floor() as i32;
567
568 if self.precision > max_precision {
569 return Err(general_err!(
570 "Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length {} and \
571 precision {}. The max precision can only be {}",
572 self.length,
573 self.precision,
574 max_precision
575 ));
576 }
577 }
578 _ => (), }
580
581 Ok(())
582 }
583}
584
/// Builder for [`Type::GroupType`] nodes.
///
/// The builder borrows the group name for `'a`; the name is copied into an
/// owned `String` only when `build` is called.
pub struct GroupTypeBuilder<'a> {
    /// Group name.
    name: &'a str,
    /// Repetition of the group; stays `None` unless set explicitly.
    repetition: Option<Repetition>,
    /// Converted type annotation, `ConvertedType::NONE` when not set.
    converted_type: ConvertedType,
    /// Optional logical type annotation.
    logical_type: Option<LogicalType>,
    /// Child nodes of the group.
    fields: Vec<TypePtr>,
    /// Optional field id.
    id: Option<i32>,
}
596
597impl<'a> GroupTypeBuilder<'a> {
598 pub fn new(name: &'a str) -> Self {
600 Self {
601 name,
602 repetition: None,
603 converted_type: ConvertedType::NONE,
604 logical_type: None,
605 fields: Vec::new(),
606 id: None,
607 }
608 }
609
610 pub fn with_repetition(mut self, repetition: Repetition) -> Self {
612 self.repetition = Some(repetition);
613 self
614 }
615
616 pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
618 Self {
619 converted_type,
620 ..self
621 }
622 }
623
624 pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
626 Self {
627 logical_type,
628 ..self
629 }
630 }
631
632 pub fn with_fields(self, fields: Vec<TypePtr>) -> Self {
635 Self { fields, ..self }
636 }
637
638 pub fn with_id(self, id: Option<i32>) -> Self {
640 Self { id, ..self }
641 }
642
643 pub fn build(self) -> Result<Type> {
645 let mut basic_info = BasicTypeInfo {
646 name: String::from(self.name),
647 repetition: self.repetition,
648 converted_type: self.converted_type,
649 logical_type: self.logical_type.clone(),
650 id: self.id,
651 };
652 if self.logical_type.is_some() && self.converted_type == ConvertedType::NONE {
654 basic_info.converted_type = self.logical_type.into();
655 }
656 Ok(Type::GroupType {
657 basic_info,
658 fields: self.fields,
659 })
660 }
661}
662
/// Attributes shared by primitive and group schema nodes.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BasicTypeInfo {
    /// Name of the node.
    name: String,
    /// Repetition of the node; `None` when no repetition was set.
    repetition: Option<Repetition>,
    /// Converted type annotation.
    converted_type: ConvertedType,
    /// Optional logical type annotation.
    logical_type: Option<LogicalType>,
    /// Optional field id.
    id: Option<i32>,
}
673
674impl HeapSize for BasicTypeInfo {
675 fn heap_size(&self) -> usize {
676 self.name.heap_size()
678 }
679}
680
681impl BasicTypeInfo {
682 pub fn name(&self) -> &str {
684 &self.name
685 }
686
687 pub fn has_repetition(&self) -> bool {
691 self.repetition.is_some()
692 }
693
694 pub fn repetition(&self) -> Repetition {
696 assert!(self.repetition.is_some());
697 self.repetition.unwrap()
698 }
699
700 pub fn converted_type(&self) -> ConvertedType {
702 self.converted_type
703 }
704
705 pub fn logical_type(&self) -> Option<LogicalType> {
707 self.logical_type.clone()
709 }
710
711 pub fn has_id(&self) -> bool {
713 self.id.is_some()
714 }
715
716 pub fn id(&self) -> i32 {
718 assert!(self.id.is_some());
719 self.id.unwrap()
720 }
721}
722
/// Path to a column, stored as an ordered list of name segments.
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub struct ColumnPath {
    /// Path segments, outermost first.
    parts: Vec<String>,
}
748
749impl HeapSize for ColumnPath {
750 fn heap_size(&self) -> usize {
751 self.parts.heap_size()
752 }
753}
754
755impl ColumnPath {
756 pub fn new(parts: Vec<String>) -> Self {
758 ColumnPath { parts }
759 }
760
761 pub fn string(&self) -> String {
769 self.parts.join(".")
770 }
771
772 pub fn append(&mut self, mut tail: Vec<String>) {
784 self.parts.append(&mut tail);
785 }
786
787 pub fn parts(&self) -> &[String] {
789 &self.parts
790 }
791}
792
793impl fmt::Display for ColumnPath {
794 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
795 write!(f, "{:?}", self.string())
796 }
797}
798
799impl From<Vec<String>> for ColumnPath {
800 fn from(parts: Vec<String>) -> Self {
801 ColumnPath { parts }
802 }
803}
804
805impl From<&str> for ColumnPath {
806 fn from(single_path: &str) -> Self {
807 let s = String::from(single_path);
808 ColumnPath::from(s)
809 }
810}
811
812impl From<String> for ColumnPath {
813 fn from(single_path: String) -> Self {
814 let v = vec![single_path];
815 ColumnPath { parts: v }
816 }
817}
818
819impl AsRef<[String]> for ColumnPath {
820 fn as_ref(&self) -> &[String] {
821 &self.parts
822 }
823}
824
/// Description of a single leaf column: its type node, maximum levels and path.
#[derive(Debug, PartialEq)]
pub struct ColumnDescriptor {
    /// Schema node of the column; the typed accessors panic unless it is a primitive type.
    primitive_type: TypePtr,

    /// Maximum definition level of the column.
    max_def_level: i16,

    /// Maximum repetition level of the column.
    max_rep_level: i16,

    /// Path of the column.
    path: ColumnPath,
}
843
844impl HeapSize for ColumnDescriptor {
845 fn heap_size(&self) -> usize {
846 self.primitive_type.heap_size() + self.path.heap_size()
847 }
848}
849
850impl ColumnDescriptor {
851 pub fn new(
853 primitive_type: TypePtr,
854 max_def_level: i16,
855 max_rep_level: i16,
856 path: ColumnPath,
857 ) -> Self {
858 Self {
859 primitive_type,
860 max_def_level,
861 max_rep_level,
862 path,
863 }
864 }
865
866 #[inline]
868 pub fn max_def_level(&self) -> i16 {
869 self.max_def_level
870 }
871
872 #[inline]
874 pub fn max_rep_level(&self) -> i16 {
875 self.max_rep_level
876 }
877
878 pub fn path(&self) -> &ColumnPath {
880 &self.path
881 }
882
883 pub fn self_type(&self) -> &Type {
885 self.primitive_type.as_ref()
886 }
887
888 pub fn self_type_ptr(&self) -> TypePtr {
891 self.primitive_type.clone()
892 }
893
894 pub fn name(&self) -> &str {
896 self.primitive_type.name()
897 }
898
899 pub fn converted_type(&self) -> ConvertedType {
901 self.primitive_type.get_basic_info().converted_type()
902 }
903
904 pub fn logical_type(&self) -> Option<LogicalType> {
906 self.primitive_type.get_basic_info().logical_type()
907 }
908
909 pub fn physical_type(&self) -> PhysicalType {
912 match self.primitive_type.as_ref() {
913 Type::PrimitiveType { physical_type, .. } => *physical_type,
914 _ => panic!("Expected primitive type!"),
915 }
916 }
917
918 pub fn type_length(&self) -> i32 {
921 match self.primitive_type.as_ref() {
922 Type::PrimitiveType { type_length, .. } => *type_length,
923 _ => panic!("Expected primitive type!"),
924 }
925 }
926
927 pub fn type_precision(&self) -> i32 {
930 match self.primitive_type.as_ref() {
931 Type::PrimitiveType { precision, .. } => *precision,
932 _ => panic!("Expected primitive type!"),
933 }
934 }
935
936 pub fn type_scale(&self) -> i32 {
939 match self.primitive_type.as_ref() {
940 Type::PrimitiveType { scale, .. } => *scale,
941 _ => panic!("Expected primitive type!"),
942 }
943 }
944
945 pub fn sort_order(&self) -> SortOrder {
947 ColumnOrder::get_sort_order(
948 self.logical_type(),
949 self.converted_type(),
950 self.physical_type(),
951 )
952 }
953}
954
/// A schema root together with a flattened index of its leaf columns.
#[derive(PartialEq)]
pub struct SchemaDescriptor {
    /// Root node of the schema; `SchemaDescriptor::new` asserts that it is a group.
    schema: TypePtr,

    /// Descriptors of all primitive leaves, in the depth-first order in which
    /// `build_tree` visits them.
    leaves: Vec<ColumnDescPtr>,

    /// For every entry of `leaves` (same index), the index of the root-level
    /// field that the leaf belongs to.
    leaf_to_base: Vec<usize>,
}
1010
1011impl fmt::Debug for SchemaDescriptor {
1012 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1013 f.debug_struct("SchemaDescriptor")
1015 .field("schema", &self.schema)
1016 .finish()
1017 }
1018}
1019
1020impl HeapSize for SchemaDescriptor {
1022 fn heap_size(&self) -> usize {
1023 self.schema.heap_size() + self.leaves.heap_size() + self.leaf_to_base.heap_size()
1024 }
1025}
1026
1027impl SchemaDescriptor {
1028 pub fn new(tp: TypePtr) -> Self {
1030 assert!(tp.is_group(), "SchemaDescriptor should take a GroupType");
1031 let mut leaves = vec![];
1032 let mut leaf_to_base = Vec::new();
1033 for (root_idx, f) in tp.get_fields().iter().enumerate() {
1034 let mut path = vec![];
1035 build_tree(f, root_idx, 0, 0, &mut leaves, &mut leaf_to_base, &mut path);
1036 }
1037
1038 Self {
1039 schema: tp,
1040 leaves,
1041 leaf_to_base,
1042 }
1043 }
1044
1045 pub fn column(&self, i: usize) -> ColumnDescPtr {
1047 assert!(
1048 i < self.leaves.len(),
1049 "Index out of bound: {} not in [0, {})",
1050 i,
1051 self.leaves.len()
1052 );
1053 self.leaves[i].clone()
1054 }
1055
1056 pub fn columns(&self) -> &[ColumnDescPtr] {
1058 &self.leaves
1059 }
1060
1061 pub fn num_columns(&self) -> usize {
1063 self.leaves.len()
1064 }
1065
1066 pub fn get_column_root(&self, i: usize) -> &Type {
1068 let result = self.column_root_of(i);
1069 result.as_ref()
1070 }
1071
1072 pub fn get_column_root_ptr(&self, i: usize) -> TypePtr {
1074 let result = self.column_root_of(i);
1075 result.clone()
1076 }
1077
1078 pub fn get_column_root_idx(&self, leaf: usize) -> usize {
1080 assert!(
1081 leaf < self.leaves.len(),
1082 "Index out of bound: {} not in [0, {})",
1083 leaf,
1084 self.leaves.len()
1085 );
1086
1087 *self
1088 .leaf_to_base
1089 .get(leaf)
1090 .unwrap_or_else(|| panic!("Expected a value for index {leaf} but found None"))
1091 }
1092
1093 fn column_root_of(&self, i: usize) -> &TypePtr {
1094 &self.schema.get_fields()[self.get_column_root_idx(i)]
1095 }
1096
1097 pub fn root_schema(&self) -> &Type {
1099 self.schema.as_ref()
1100 }
1101
1102 pub fn root_schema_ptr(&self) -> TypePtr {
1104 self.schema.clone()
1105 }
1106
1107 pub fn name(&self) -> &str {
1109 self.schema.name()
1110 }
1111}
1112
1113fn build_tree<'a>(
1114 tp: &'a TypePtr,
1115 root_idx: usize,
1116 mut max_rep_level: i16,
1117 mut max_def_level: i16,
1118 leaves: &mut Vec<ColumnDescPtr>,
1119 leaf_to_base: &mut Vec<usize>,
1120 path_so_far: &mut Vec<&'a str>,
1121) {
1122 assert!(tp.get_basic_info().has_repetition());
1123
1124 path_so_far.push(tp.name());
1125 match tp.get_basic_info().repetition() {
1126 Repetition::OPTIONAL => {
1127 max_def_level += 1;
1128 }
1129 Repetition::REPEATED => {
1130 max_def_level += 1;
1131 max_rep_level += 1;
1132 }
1133 _ => {}
1134 }
1135
1136 match tp.as_ref() {
1137 Type::PrimitiveType { .. } => {
1138 let mut path: Vec<String> = vec![];
1139 path.extend(path_so_far.iter().copied().map(String::from));
1140 leaves.push(Arc::new(ColumnDescriptor::new(
1141 tp.clone(),
1142 max_def_level,
1143 max_rep_level,
1144 ColumnPath::new(path),
1145 )));
1146 leaf_to_base.push(root_idx);
1147 }
1148 Type::GroupType { ref fields, .. } => {
1149 for f in fields {
1150 build_tree(
1151 f,
1152 root_idx,
1153 max_rep_level,
1154 max_def_level,
1155 leaves,
1156 leaf_to_base,
1157 path_so_far,
1158 );
1159 path_so_far.pop();
1160 }
1161 }
1162 }
1163}
1164
1165pub fn from_thrift(elements: &[SchemaElement]) -> Result<TypePtr> {
1167 let mut index = 0;
1168 let mut schema_nodes = Vec::new();
1169 while index < elements.len() {
1170 let t = from_thrift_helper(elements, index)?;
1171 index = t.0;
1172 schema_nodes.push(t.1);
1173 }
1174 if schema_nodes.len() != 1 {
1175 return Err(general_err!(
1176 "Expected exactly one root node, but found {}",
1177 schema_nodes.len()
1178 ));
1179 }
1180
1181 if !schema_nodes[0].is_group() {
1182 return Err(general_err!("Expected root node to be a group type"));
1183 }
1184
1185 Ok(schema_nodes.remove(0))
1186}
1187
1188fn check_logical_type(logical_type: &Option<LogicalType>) -> Result<()> {
1190 if let Some(LogicalType::Integer { bit_width, .. }) = *logical_type {
1191 if bit_width != 8 && bit_width != 16 && bit_width != 32 && bit_width != 64 {
1192 return Err(general_err!(
1193 "Bit width must be 8, 16, 32, or 64 for Integer logical type"
1194 ));
1195 }
1196 }
1197 Ok(())
1198}
1199
1200fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize, TypePtr)> {
1205 let is_root_node = index == 0;
1208
1209 if index >= elements.len() {
1210 return Err(general_err!(
1211 "Index out of bound, index = {}, len = {}",
1212 index,
1213 elements.len()
1214 ));
1215 }
1216 let element = &elements[index];
1217
1218 if let (true, None | Some(0)) = (is_root_node, element.num_children) {
1220 let builder = Type::group_type_builder(&element.name);
1221 return Ok((index + 1, Arc::new(builder.build().unwrap())));
1222 }
1223
1224 let converted_type = ConvertedType::try_from(element.converted_type)?;
1225 let logical_type = element
1228 .logical_type
1229 .as_ref()
1230 .map(|value| LogicalType::from(value.clone()));
1231
1232 check_logical_type(&logical_type)?;
1233
1234 let field_id = elements[index].field_id;
1235 match elements[index].num_children {
1236 None | Some(0) => {
1242 if elements[index].repetition_type.is_none() {
1244 return Err(general_err!(
1245 "Repetition level must be defined for a primitive type"
1246 ));
1247 }
1248 let repetition = Repetition::try_from(elements[index].repetition_type.unwrap())?;
1249 if let Some(type_) = elements[index].type_ {
1250 let physical_type = PhysicalType::try_from(type_)?;
1251 let length = elements[index].type_length.unwrap_or(-1);
1252 let scale = elements[index].scale.unwrap_or(-1);
1253 let precision = elements[index].precision.unwrap_or(-1);
1254 let name = &elements[index].name;
1255 let builder = Type::primitive_type_builder(name, physical_type)
1256 .with_repetition(repetition)
1257 .with_converted_type(converted_type)
1258 .with_logical_type(logical_type)
1259 .with_length(length)
1260 .with_precision(precision)
1261 .with_scale(scale)
1262 .with_id(field_id);
1263 Ok((index + 1, Arc::new(builder.build()?)))
1264 } else {
1265 let mut builder = Type::group_type_builder(&elements[index].name)
1266 .with_converted_type(converted_type)
1267 .with_logical_type(logical_type)
1268 .with_id(field_id);
1269 if !is_root_node {
1270 builder = builder.with_repetition(repetition);
1278 }
1279 Ok((index + 1, Arc::new(builder.build().unwrap())))
1280 }
1281 }
1282 Some(n) => {
1283 let repetition = elements[index]
1284 .repetition_type
1285 .map(Repetition::try_from)
1286 .transpose()?;
1287
1288 let mut fields = vec![];
1289 let mut next_index = index + 1;
1290 for _ in 0..n {
1291 let child_result = from_thrift_helper(elements, next_index)?;
1292 next_index = child_result.0;
1293 fields.push(child_result.1);
1294 }
1295
1296 let mut builder = Type::group_type_builder(&elements[index].name)
1297 .with_converted_type(converted_type)
1298 .with_logical_type(logical_type)
1299 .with_fields(fields)
1300 .with_id(field_id);
1301 if let Some(rep) = repetition {
1302 if !is_root_node {
1310 builder = builder.with_repetition(rep);
1311 }
1312 }
1313 Ok((next_index, Arc::new(builder.build().unwrap())))
1314 }
1315 }
1316}
1317
1318pub fn to_thrift(schema: &Type) -> Result<Vec<SchemaElement>> {
1320 if !schema.is_group() {
1321 return Err(general_err!("Root schema must be Group type"));
1322 }
1323 let mut elements: Vec<SchemaElement> = Vec::new();
1324 to_thrift_helper(schema, &mut elements);
1325 Ok(elements)
1326}
1327
1328fn to_thrift_helper(schema: &Type, elements: &mut Vec<SchemaElement>) {
1331 match *schema {
1332 Type::PrimitiveType {
1333 ref basic_info,
1334 physical_type,
1335 type_length,
1336 scale,
1337 precision,
1338 } => {
1339 let element = SchemaElement {
1340 type_: Some(physical_type.into()),
1341 type_length: if type_length >= 0 {
1342 Some(type_length)
1343 } else {
1344 None
1345 },
1346 repetition_type: Some(basic_info.repetition().into()),
1347 name: basic_info.name().to_owned(),
1348 num_children: None,
1349 converted_type: basic_info.converted_type().into(),
1350 scale: if scale >= 0 { Some(scale) } else { None },
1351 precision: if precision >= 0 {
1352 Some(precision)
1353 } else {
1354 None
1355 },
1356 field_id: if basic_info.has_id() {
1357 Some(basic_info.id())
1358 } else {
1359 None
1360 },
1361 logical_type: basic_info.logical_type().map(|value| value.into()),
1362 };
1363
1364 elements.push(element);
1365 }
1366 Type::GroupType {
1367 ref basic_info,
1368 ref fields,
1369 } => {
1370 let repetition = if basic_info.has_repetition() {
1371 Some(basic_info.repetition().into())
1372 } else {
1373 None
1374 };
1375
1376 let element = SchemaElement {
1377 type_: None,
1378 type_length: None,
1379 repetition_type: repetition,
1380 name: basic_info.name().to_owned(),
1381 num_children: Some(fields.len() as i32),
1382 converted_type: basic_info.converted_type().into(),
1383 scale: None,
1384 precision: None,
1385 field_id: if basic_info.has_id() {
1386 Some(basic_info.id())
1387 } else {
1388 None
1389 },
1390 logical_type: basic_info.logical_type().map(|value| value.into()),
1391 };
1392
1393 elements.push(element);
1394
1395 for field in fields {
1397 to_thrift_helper(field, elements);
1398 }
1399 }
1400 }
1401}
1402
1403#[cfg(test)]
1404mod tests {
1405 use super::*;
1406
1407 use crate::schema::parser::parse_message_type;
1408
1409 #[test]
1412 fn test_primitive_type() {
1413 let mut result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1414 .with_logical_type(Some(LogicalType::Integer {
1415 bit_width: 32,
1416 is_signed: true,
1417 }))
1418 .with_id(Some(0))
1419 .build();
1420 assert!(result.is_ok());
1421
1422 if let Ok(tp) = result {
1423 assert!(tp.is_primitive());
1424 assert!(!tp.is_group());
1425 let basic_info = tp.get_basic_info();
1426 assert_eq!(basic_info.repetition(), Repetition::OPTIONAL);
1427 assert_eq!(
1428 basic_info.logical_type(),
1429 Some(LogicalType::Integer {
1430 bit_width: 32,
1431 is_signed: true
1432 })
1433 );
1434 assert_eq!(basic_info.converted_type(), ConvertedType::INT_32);
1435 assert_eq!(basic_info.id(), 0);
1436 match tp {
1437 Type::PrimitiveType { physical_type, .. } => {
1438 assert_eq!(physical_type, PhysicalType::INT32);
1439 }
1440 _ => panic!(),
1441 }
1442 }
1443
1444 result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1446 .with_repetition(Repetition::REPEATED)
1447 .with_logical_type(Some(LogicalType::Integer {
1448 is_signed: true,
1449 bit_width: 8,
1450 }))
1451 .build();
1452 assert!(result.is_err());
1453 if let Err(e) = result {
1454 assert_eq!(
1455 format!("{e}"),
1456 "Parquet error: Cannot annotate Integer { bit_width: 8, is_signed: true } from INT64 for field 'foo'"
1457 );
1458 }
1459
1460 result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1462 .with_repetition(Repetition::REPEATED)
1463 .with_converted_type(ConvertedType::BSON)
1464 .build();
1465 assert!(result.is_err());
1466 if let Err(e) = result {
1467 assert_eq!(
1468 format!("{e}"),
1469 "Parquet error: BSON cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1470 );
1471 }
1472
1473 result = Type::primitive_type_builder("foo", PhysicalType::INT96)
1474 .with_repetition(Repetition::REQUIRED)
1475 .with_converted_type(ConvertedType::DECIMAL)
1476 .with_precision(-1)
1477 .with_scale(-1)
1478 .build();
1479 assert!(result.is_err());
1480 if let Err(e) = result {
1481 assert_eq!(
1482 format!("{e}"),
1483 "Parquet error: DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
1484 );
1485 }
1486
1487 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1488 .with_repetition(Repetition::REQUIRED)
1489 .with_logical_type(Some(LogicalType::Decimal {
1490 scale: 32,
1491 precision: 12,
1492 }))
1493 .with_precision(-1)
1494 .with_scale(-1)
1495 .build();
1496 assert!(result.is_err());
1497 if let Err(e) = result {
1498 assert_eq!(
1499 format!("{e}"),
1500 "Parquet error: DECIMAL logical type scale 32 must match self.scale -1 for field 'foo'"
1501 );
1502 }
1503
1504 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1505 .with_repetition(Repetition::REQUIRED)
1506 .with_converted_type(ConvertedType::DECIMAL)
1507 .with_precision(-1)
1508 .with_scale(-1)
1509 .build();
1510 assert!(result.is_err());
1511 if let Err(e) = result {
1512 assert_eq!(
1513 format!("{e}"),
1514 "Parquet error: Invalid DECIMAL precision: -1"
1515 );
1516 }
1517
1518 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1519 .with_repetition(Repetition::REQUIRED)
1520 .with_converted_type(ConvertedType::DECIMAL)
1521 .with_precision(0)
1522 .with_scale(-1)
1523 .build();
1524 assert!(result.is_err());
1525 if let Err(e) = result {
1526 assert_eq!(
1527 format!("{e}"),
1528 "Parquet error: Invalid DECIMAL precision: 0"
1529 );
1530 }
1531
1532 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1533 .with_repetition(Repetition::REQUIRED)
1534 .with_converted_type(ConvertedType::DECIMAL)
1535 .with_precision(1)
1536 .with_scale(-1)
1537 .build();
1538 assert!(result.is_err());
1539 if let Err(e) = result {
1540 assert_eq!(format!("{e}"), "Parquet error: Invalid DECIMAL scale: -1");
1541 }
1542
1543 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1544 .with_repetition(Repetition::REQUIRED)
1545 .with_converted_type(ConvertedType::DECIMAL)
1546 .with_precision(1)
1547 .with_scale(2)
1548 .build();
1549 assert!(result.is_err());
1550 if let Err(e) = result {
1551 assert_eq!(
1552 format!("{e}"),
1553 "Parquet error: Invalid DECIMAL: scale (2) cannot be greater than precision (1)"
1554 );
1555 }
1556
1557 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1559 .with_repetition(Repetition::REQUIRED)
1560 .with_converted_type(ConvertedType::DECIMAL)
1561 .with_precision(1)
1562 .with_scale(1)
1563 .build();
1564 assert!(result.is_ok());
1565
1566 result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1567 .with_repetition(Repetition::REQUIRED)
1568 .with_converted_type(ConvertedType::DECIMAL)
1569 .with_precision(18)
1570 .with_scale(2)
1571 .build();
1572 assert!(result.is_err());
1573 if let Err(e) = result {
1574 assert_eq!(
1575 format!("{e}"),
1576 "Parquet error: Cannot represent INT32 as DECIMAL with precision 18"
1577 );
1578 }
1579
1580 result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1581 .with_repetition(Repetition::REQUIRED)
1582 .with_converted_type(ConvertedType::DECIMAL)
1583 .with_precision(32)
1584 .with_scale(2)
1585 .build();
1586 assert!(result.is_err());
1587 if let Err(e) = result {
1588 assert_eq!(
1589 format!("{e}"),
1590 "Parquet error: Cannot represent INT64 as DECIMAL with precision 32"
1591 );
1592 }
1593
1594 result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1595 .with_repetition(Repetition::REQUIRED)
1596 .with_converted_type(ConvertedType::DECIMAL)
1597 .with_length(5)
1598 .with_precision(12)
1599 .with_scale(2)
1600 .build();
1601 assert!(result.is_err());
1602 if let Err(e) = result {
1603 assert_eq!(
1604 format!("{e}"),
1605 "Parquet error: Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length 5 and precision 12. The max precision can only be 11"
1606 );
1607 }
1608
1609 result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1610 .with_repetition(Repetition::REQUIRED)
1611 .with_converted_type(ConvertedType::UINT_8)
1612 .build();
1613 assert!(result.is_err());
1614 if let Err(e) = result {
1615 assert_eq!(
1616 format!("{e}"),
1617 "Parquet error: UINT_8 cannot annotate field 'foo' because it is not a INT32 field"
1618 );
1619 }
1620
1621 result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1622 .with_repetition(Repetition::REQUIRED)
1623 .with_converted_type(ConvertedType::TIME_MICROS)
1624 .build();
1625 assert!(result.is_err());
1626 if let Err(e) = result {
1627 assert_eq!(
1628 format!("{e}"),
1629 "Parquet error: TIME_MICROS cannot annotate field 'foo' because it is not a INT64 field"
1630 );
1631 }
1632
1633 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1634 .with_repetition(Repetition::REQUIRED)
1635 .with_converted_type(ConvertedType::INTERVAL)
1636 .build();
1637 assert!(result.is_err());
1638 if let Err(e) = result {
1639 assert_eq!(
1640 format!("{e}"),
1641 "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1642 );
1643 }
1644
1645 result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1646 .with_repetition(Repetition::REQUIRED)
1647 .with_converted_type(ConvertedType::INTERVAL)
1648 .with_length(1)
1649 .build();
1650 assert!(result.is_err());
1651 if let Err(e) = result {
1652 assert_eq!(
1653 format!("{e}"),
1654 "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1655 );
1656 }
1657
1658 result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1659 .with_repetition(Repetition::REQUIRED)
1660 .with_converted_type(ConvertedType::ENUM)
1661 .build();
1662 assert!(result.is_err());
1663 if let Err(e) = result {
1664 assert_eq!(
1665 format!("{e}"),
1666 "Parquet error: ENUM cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1667 );
1668 }
1669
1670 result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1671 .with_repetition(Repetition::REQUIRED)
1672 .with_converted_type(ConvertedType::MAP)
1673 .build();
1674 assert!(result.is_err());
1675 if let Err(e) = result {
1676 assert_eq!(
1677 format!("{e}"),
1678 "Parquet error: MAP cannot be applied to primitive field 'foo'"
1679 );
1680 }
1681
1682 result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1683 .with_repetition(Repetition::REQUIRED)
1684 .with_converted_type(ConvertedType::DECIMAL)
1685 .with_length(-1)
1686 .build();
1687 assert!(result.is_err());
1688 if let Err(e) = result {
1689 assert_eq!(
1690 format!("{e}"),
1691 "Parquet error: Invalid FIXED_LEN_BYTE_ARRAY length: -1 for field 'foo'"
1692 );
1693 }
1694
1695 result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1696 .with_repetition(Repetition::REQUIRED)
1697 .with_logical_type(Some(LogicalType::Float16))
1698 .with_length(2)
1699 .build();
1700 assert!(result.is_ok());
1701
1702 result = Type::primitive_type_builder("foo", PhysicalType::FLOAT)
1704 .with_repetition(Repetition::REQUIRED)
1705 .with_logical_type(Some(LogicalType::Float16))
1706 .with_length(2)
1707 .build();
1708 assert!(result.is_err());
1709 if let Err(e) = result {
1710 assert_eq!(
1711 format!("{e}"),
1712 "Parquet error: Cannot annotate Float16 from FLOAT for field 'foo'"
1713 );
1714 }
1715
1716 result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1718 .with_repetition(Repetition::REQUIRED)
1719 .with_logical_type(Some(LogicalType::Float16))
1720 .with_length(4)
1721 .build();
1722 assert!(result.is_err());
1723 if let Err(e) = result {
1724 assert_eq!(
1725 format!("{e}"),
1726 "Parquet error: FLOAT16 cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(2) field"
1727 );
1728 }
1729
1730 result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1732 .with_repetition(Repetition::REQUIRED)
1733 .with_logical_type(Some(LogicalType::Uuid))
1734 .with_length(15)
1735 .build();
1736 assert!(result.is_err());
1737 if let Err(e) = result {
1738 assert_eq!(
1739 format!("{e}"),
1740 "Parquet error: UUID cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(16) field"
1741 );
1742 }
1743 }
1744
/// Builds a REPEATED group annotated with the `List` logical type around two
/// primitive children and checks what the resulting `Type` reports.
#[test]
fn test_group_type() {
    // Both children carry a converted type and an explicit field id.
    let int_child = Type::primitive_type_builder("f1", PhysicalType::INT32)
        .with_converted_type(ConvertedType::INT_32)
        .with_id(Some(0))
        .build();
    assert!(int_child.is_ok());
    let utf8_child = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY)
        .with_converted_type(ConvertedType::UTF8)
        .with_id(Some(1))
        .build();
    assert!(utf8_child.is_ok());

    let children = vec![Arc::new(int_child.unwrap()), Arc::new(utf8_child.unwrap())];
    let built = Type::group_type_builder("foo")
        .with_repetition(Repetition::REPEATED)
        .with_logical_type(Some(LogicalType::List))
        .with_fields(children)
        .with_id(Some(1))
        .build();
    assert!(built.is_ok());
    let group = built.unwrap();

    // Kind predicates.
    assert!(group.is_group());
    assert!(!group.is_primitive());

    // Only the logical type was set on the builder; the converted type is
    // still expected to report LIST.
    let info = group.get_basic_info();
    assert_eq!(info.repetition(), Repetition::REPEATED);
    assert_eq!(info.logical_type(), Some(LogicalType::List));
    assert_eq!(info.converted_type(), ConvertedType::LIST);
    assert_eq!(info.id(), 1);

    // Children come back in insertion order; comparing the collected names
    // checks both the count and each name.
    let child_names: Vec<&str> = group.get_fields().iter().map(|f| f.name()).collect();
    assert_eq!(child_names, ["f1", "f2"]);
}
1780
/// Runs `test_column_descriptor_helper` and fails with the helper's error
/// rendered into the panic message if it returns `Err`.
#[test]
fn test_column_descriptor() {
    if let Err(err) = test_column_descriptor_helper() {
        panic!("Expected result to be OK but got err:\n {}", err);
    }
}
1790
1791 fn test_column_descriptor_helper() -> Result<()> {
1792 let tp = Type::primitive_type_builder("name", PhysicalType::BYTE_ARRAY)
1793 .with_converted_type(ConvertedType::UTF8)
1794 .build()?;
1795
1796 let descr = ColumnDescriptor::new(Arc::new(tp), 4, 1, ColumnPath::from("name"));
1797
1798 assert_eq!(descr.path(), &ColumnPath::from("name"));
1799 assert_eq!(descr.converted_type(), ConvertedType::UTF8);
1800 assert_eq!(descr.physical_type(), PhysicalType::BYTE_ARRAY);
1801 assert_eq!(descr.max_def_level(), 4);
1802 assert_eq!(descr.max_rep_level(), 1);
1803 assert_eq!(descr.name(), "name");
1804 assert_eq!(descr.type_length(), -1);
1805 assert_eq!(descr.type_precision(), -1);
1806 assert_eq!(descr.type_scale(), -1);
1807
1808 Ok(())
1809 }
1810
/// Runs `test_schema_descriptor_helper` and fails with the helper's error
/// rendered into the panic message if it returns `Err`.
#[test]
fn test_schema_descriptor() {
    if let Err(err) = test_schema_descriptor_helper() {
        panic!("Expected result to be OK but got err:\n {}", err);
    }
}
1820
1821 fn test_schema_descriptor_helper() -> Result<()> {
1823 let mut fields = vec![];
1824
1825 let inta = Type::primitive_type_builder("a", PhysicalType::INT32)
1826 .with_repetition(Repetition::REQUIRED)
1827 .with_converted_type(ConvertedType::INT_32)
1828 .build()?;
1829 fields.push(Arc::new(inta));
1830 let intb = Type::primitive_type_builder("b", PhysicalType::INT64)
1831 .with_converted_type(ConvertedType::INT_64)
1832 .build()?;
1833 fields.push(Arc::new(intb));
1834 let intc = Type::primitive_type_builder("c", PhysicalType::BYTE_ARRAY)
1835 .with_repetition(Repetition::REPEATED)
1836 .with_converted_type(ConvertedType::UTF8)
1837 .build()?;
1838 fields.push(Arc::new(intc));
1839
1840 let item1 = Type::primitive_type_builder("item1", PhysicalType::INT64)
1842 .with_repetition(Repetition::REQUIRED)
1843 .with_converted_type(ConvertedType::INT_64)
1844 .build()?;
1845 let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?;
1846 let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32)
1847 .with_repetition(Repetition::REPEATED)
1848 .with_converted_type(ConvertedType::INT_32)
1849 .build()?;
1850 let list = Type::group_type_builder("records")
1851 .with_repetition(Repetition::REPEATED)
1852 .with_converted_type(ConvertedType::LIST)
1853 .with_fields(vec![Arc::new(item1), Arc::new(item2), Arc::new(item3)])
1854 .build()?;
1855 let bag = Type::group_type_builder("bag")
1856 .with_repetition(Repetition::OPTIONAL)
1857 .with_fields(vec![Arc::new(list)])
1858 .build()?;
1859 fields.push(Arc::new(bag));
1860
1861 let schema = Type::group_type_builder("schema")
1862 .with_repetition(Repetition::REPEATED)
1863 .with_fields(fields)
1864 .build()?;
1865 let descr = SchemaDescriptor::new(Arc::new(schema));
1866
1867 let nleaves = 6;
1868 assert_eq!(descr.num_columns(), nleaves);
1869
1870 let ex_max_def_levels = [0, 1, 1, 2, 3, 3];
1880 let ex_max_rep_levels = [0, 0, 1, 1, 1, 2];
1881
1882 for i in 0..nleaves {
1883 let col = descr.column(i);
1884 assert_eq!(col.max_def_level(), ex_max_def_levels[i], "{i}");
1885 assert_eq!(col.max_rep_level(), ex_max_rep_levels[i], "{i}");
1886 }
1887
1888 assert_eq!(descr.column(0).path().string(), "a");
1889 assert_eq!(descr.column(1).path().string(), "b");
1890 assert_eq!(descr.column(2).path().string(), "c");
1891 assert_eq!(descr.column(3).path().string(), "bag.records.item1");
1892 assert_eq!(descr.column(4).path().string(), "bag.records.item2");
1893 assert_eq!(descr.column(5).path().string(), "bag.records.item3");
1894
1895 assert_eq!(descr.get_column_root(0).name(), "a");
1896 assert_eq!(descr.get_column_root(3).name(), "bag");
1897 assert_eq!(descr.get_column_root(4).name(), "bag");
1898 assert_eq!(descr.get_column_root(5).name(), "bag");
1899
1900 Ok(())
1901 }
1902
/// Parses a small schema from text and checks the max definition and
/// repetition level computed for each of its four leaf columns.
#[test]
fn test_schema_build_tree_def_rep_levels() {
    let message_type = "
    message spark_schema {
      REQUIRED INT32 a;
      OPTIONAL group b {
        OPTIONAL INT32 _1;
        OPTIONAL INT32 _2;
      }
      OPTIONAL group c (LIST) {
        REPEATED group list {
          OPTIONAL INT32 element;
        }
      }
    }
    ";
    let schema = parse_message_type(message_type).expect("should parse schema");
    let descr = SchemaDescriptor::new(Arc::new(schema));

    // (max definition level, max repetition level) of the leaf at `index`.
    let levels = |index: usize| {
        let col = descr.column(index);
        (col.max_def_level(), col.max_rep_level())
    };

    // NOTE(review): leaves presumably enumerate in declaration order
    // (a, b._1, b._2, c.list.element) - confirm against SchemaDescriptor.
    assert_eq!(levels(0), (0, 0));
    assert_eq!(levels(1), (2, 0));
    assert_eq!(levels(2), (2, 0));
    assert_eq!(levels(3), (3, 1));
}
1934
/// `get_physical_type()` is only defined for primitive types; calling it on a
/// group must panic with the message named in `should_panic`.
#[test]
#[should_panic(expected = "Cannot call get_physical_type() on a non-primitive type")]
fn test_get_physical_type_panic() {
    let group_builder = Type::group_type_builder("records").with_repetition(Repetition::REPEATED);
    let group = group_builder.build().unwrap();
    // The call itself is the assertion: it is expected to panic.
    group.get_physical_type();
}
1944
/// A primitive type reports the physical type it was built with.
#[test]
fn test_get_physical_type_primitive() {
    for physical in [PhysicalType::INT64, PhysicalType::BYTE_ARRAY] {
        let leaf = Type::primitive_type_builder("f", physical)
            .build()
            .unwrap();
        assert_eq!(leaf.get_physical_type(), physical);
    }
}
1957
/// Exercises `check_contains` between two primitive types, varying one
/// property (converted type, name, physical type, repetition) at a time.
#[test]
fn test_check_contains_primitive_primitive() {
    // Leaf with only a name and physical type; everything else is whatever
    // the builder leaves it at.
    let plain = |name: &str, physical: PhysicalType| -> Type {
        Type::primitive_type_builder(name, physical)
            .build()
            .unwrap()
    };
    // INT32 leaf "f" with an explicit converted type.
    let with_converted = |converted: ConvertedType| -> Type {
        Type::primitive_type_builder("f", PhysicalType::INT32)
            .with_converted_type(converted)
            .build()
            .unwrap()
    };
    // INT32 leaf "f" with an explicit repetition.
    let with_repetition = |repetition: Repetition| -> Type {
        Type::primitive_type_builder("f", PhysicalType::INT32)
            .with_repetition(repetition)
            .build()
            .unwrap()
    };

    // Same name, same physical type: contained.
    let lhs = plain("f", PhysicalType::INT32);
    let rhs = plain("f", PhysicalType::INT32);
    assert!(lhs.check_contains(&rhs));

    // Converted types differ (UINT_8 vs UINT_16): still contained.
    let lhs = with_converted(ConvertedType::UINT_8);
    let rhs = with_converted(ConvertedType::UINT_16);
    assert!(lhs.check_contains(&rhs));

    // Names differ: not contained.
    let lhs = plain("f1", PhysicalType::INT32);
    let rhs = plain("f2", PhysicalType::INT32);
    assert!(!lhs.check_contains(&rhs));

    // Physical types differ: not contained.
    let lhs = plain("f", PhysicalType::INT32);
    let rhs = plain("f", PhysicalType::INT64);
    assert!(!lhs.check_contains(&rhs));

    // Repetitions differ (REQUIRED vs OPTIONAL): not contained.
    let lhs = with_repetition(Repetition::REQUIRED);
    let rhs = with_repetition(Repetition::OPTIONAL);
    assert!(!lhs.check_contains(&rhs));
}
2009
2010 fn test_new_group_type(name: &str, repetition: Repetition, types: Vec<Type>) -> Type {
2012 Type::group_type_builder(name)
2013 .with_repetition(repetition)
2014 .with_fields(types.into_iter().map(Arc::new).collect())
2015 .build()
2016 .unwrap()
2017 }
2018
/// Exercises `check_contains` between two group types: identical groups,
/// a subset of fields, and mismatches in name, repetition and child types.
#[test]
fn test_check_contains_group_group() {
    // Primitive child with only a name and physical type.
    let leaf = |name: &str, physical: PhysicalType| -> Type {
        Type::primitive_type_builder(name, physical)
            .build()
            .unwrap()
    };
    // Group with no fields and no explicit repetition.
    let bare_group = |name: &str| -> Type { Type::group_type_builder(name).build().unwrap() };
    // REPEATED group "f" around the given children.
    let repeated_f =
        |children: Vec<Type>| -> Type { test_new_group_type("f", Repetition::REPEATED, children) };

    // Two field-less groups with the same name: contained.
    let lhs = bare_group("f");
    let rhs = bare_group("f");
    assert!(lhs.check_contains(&rhs));
    assert!(!lhs.is_optional());

    // Reference group { f1: INT32, f2: INT64 } used as the left-hand side of
    // the remaining field-based cases.
    let both_fields = repeated_f(vec![
        leaf("f1", PhysicalType::INT32),
        leaf("f2", PhysicalType::INT64),
    ]);

    // Identical field lists: contained.
    let same_fields = repeated_f(vec![
        leaf("f1", PhysicalType::INT32),
        leaf("f2", PhysicalType::INT64),
    ]);
    assert!(both_fields.check_contains(&same_fields));

    // Right-hand side has only a subset of the fields: contained.
    let only_f2 = repeated_f(vec![leaf("f2", PhysicalType::INT64)]);
    assert!(both_fields.check_contains(&only_f2));

    // Group names differ: not contained.
    let lhs = bare_group("f1");
    let rhs = bare_group("f2");
    assert!(!lhs.check_contains(&rhs));

    // Repetitions differ (OPTIONAL vs REPEATED): not contained.
    let optional_f = Type::group_type_builder("f")
        .with_repetition(Repetition::OPTIONAL)
        .build()
        .unwrap();
    let repeated_empty_f = Type::group_type_builder("f")
        .with_repetition(Repetition::REPEATED)
        .build()
        .unwrap();
    assert!(!optional_f.check_contains(&repeated_empty_f));

    // Same field names but `f2` has a different physical type: not contained.
    let f2_is_boolean = repeated_f(vec![
        leaf("f1", PhysicalType::INT32),
        leaf("f2", PhysicalType::BOOLEAN),
    ]);
    assert!(!both_fields.check_contains(&f2_is_boolean));

    // Right-hand side has a field the left-hand side lacks: not contained.
    let only_f3 = repeated_f(vec![leaf("f3", PhysicalType::INT32)]);
    assert!(!both_fields.check_contains(&only_f3));
}
2141
/// Exercises `check_contains` across group/primitive boundaries and with a
/// nested group, checking both directions of each comparison.
#[test]
fn test_check_contains_group_primitive() {
    // Primitive with only a name and physical type.
    let leaf = |name: &str, physical: PhysicalType| -> Type {
        Type::primitive_type_builder(name, physical)
            .build()
            .unwrap()
    };

    // A group and a primitive sharing a name never contain one another.
    let group = Type::group_type_builder("f").build().unwrap();
    let primitive = leaf("f", PhysicalType::INT64);
    assert!(!group.check_contains(&primitive));
    assert!(!primitive.check_contains(&group));

    // A group does not contain a primitive that merely matches one of its
    // children, and the primitive does not contain the group.
    let group = test_new_group_type(
        "f",
        Repetition::REPEATED,
        vec![leaf("f1", PhysicalType::INT32)],
    );
    let primitive = leaf("f1", PhysicalType::INT32);
    assert!(!group.check_contains(&primitive));
    assert!(!primitive.check_contains(&group));

    // Nested case: a { b { c }, d, e } versus a { b { c } }.
    let nested_b = || {
        test_new_group_type(
            "b",
            Repetition::REPEATED,
            vec![leaf("c", PhysicalType::INT32)],
        )
    };
    let wide = test_new_group_type(
        "a",
        Repetition::REPEATED,
        vec![
            nested_b(),
            leaf("d", PhysicalType::INT64),
            leaf("e", PhysicalType::BOOLEAN),
        ],
    );
    let narrow = test_new_group_type("a", Repetition::REPEATED, vec![nested_b()]);
    // The wider group contains the narrower one, but not the other way round.
    assert!(wide.check_contains(&narrow));
    assert!(!narrow.check_contains(&wide));
}
2200
/// `to_thrift` must reject a schema whose root is a primitive type.
#[test]
fn test_schema_type_thrift_conversion_err() {
    let primitive_root = Type::primitive_type_builder("col", PhysicalType::INT32)
        .build()
        .unwrap();

    let conversion = to_thrift(&primitive_root);
    assert!(conversion.is_err());

    // Compare the rendered error text against the exact expected message.
    let message = conversion.err().map(|e| e.to_string());
    assert_eq!(
        message.as_deref(),
        Some("Parquet error: Root schema must be Group type")
    );
}
2215
/// Round-trips a schema with nested lists, maps and structs through
/// `to_thrift` / `from_thrift` and expects the original type tree back.
#[test]
fn test_schema_type_thrift_conversion() {
    let message_type = "
    message conversions {
      REQUIRED INT64 id;
      OPTIONAL FIXED_LEN_BYTE_ARRAY (2) f16 (FLOAT16);
      OPTIONAL group int_array_Array (LIST) {
        REPEATED group list {
          OPTIONAL group element (LIST) {
            REPEATED group list {
              OPTIONAL INT32 element;
            }
          }
        }
      }
      OPTIONAL group int_map (MAP) {
        REPEATED group map (MAP_KEY_VALUE) {
          REQUIRED BYTE_ARRAY key (UTF8);
          OPTIONAL INT32 value;
        }
      }
      OPTIONAL group int_Map_Array (LIST) {
        REPEATED group list {
          OPTIONAL group g (MAP) {
            REPEATED group map (MAP_KEY_VALUE) {
              REQUIRED BYTE_ARRAY key (UTF8);
              OPTIONAL group value {
                OPTIONAL group H {
                  OPTIONAL group i (LIST) {
                    REPEATED group list {
                      OPTIONAL DOUBLE element;
                    }
                  }
                }
              }
            }
          }
        }
      }
      OPTIONAL group nested_struct {
        OPTIONAL INT32 A;
        OPTIONAL group b (LIST) {
          REPEATED group list {
            REQUIRED FIXED_LEN_BYTE_ARRAY (16) element;
          }
        }
      }
    }
    ";
    let expected = parse_message_type(message_type).unwrap();

    // Type tree -> thrift elements -> type tree.
    let round_tripped = to_thrift(&expected)
        .and_then(|elements| from_thrift(&elements))
        .unwrap();
    assert_eq!(round_tripped, Arc::new(expected));
}
2270
/// Round-trips a schema with DECIMAL-annotated INT64, FIXED_LEN_BYTE_ARRAY and
/// BYTE_ARRAY columns through `to_thrift` / `from_thrift`.
#[test]
fn test_schema_type_thrift_conversion_decimal() {
    let message_type = "
    message decimals {
      OPTIONAL INT32 field0;
      OPTIONAL INT64 field1 (DECIMAL (18, 2));
      OPTIONAL FIXED_LEN_BYTE_ARRAY (16) field2 (DECIMAL (38, 18));
      OPTIONAL BYTE_ARRAY field3 (DECIMAL (9));
    }
    ";
    let expected = parse_message_type(message_type).unwrap();

    // Type tree -> thrift elements -> type tree.
    let round_tripped = to_thrift(&expected)
        .and_then(|elements| from_thrift(&elements))
        .unwrap();
    assert_eq!(round_tripped, Arc::new(expected));
}
2286
/// `from_thrift` must produce the same schema when every element that had no
/// `num_children` is given an explicit `Some(0)` instead.
#[test]
fn test_schema_from_thrift_with_num_children_set() {
    let message_type = "
    message schema {
      OPTIONAL BYTE_ARRAY id (UTF8);
      OPTIONAL BYTE_ARRAY name (UTF8);
      OPTIONAL BYTE_ARRAY message (UTF8);
      OPTIONAL INT32 type (UINT_8);
      OPTIONAL INT64 author_time (TIMESTAMP_MILLIS);
      OPTIONAL INT64 __index_level_0__;
    }
    ";

    let expected = parse_message_type(message_type).unwrap();
    let mut elements = to_thrift(&expected).unwrap();

    // Replace every unset child count with an explicit zero; elements that
    // already have a count are left untouched.
    elements
        .iter_mut()
        .filter(|element| element.num_children.is_none())
        .for_each(|element| element.num_children = Some(0));

    let decoded = from_thrift(&elements).unwrap();
    assert_eq!(decoded, Arc::new(expected));
}
2315
/// `from_thrift` must produce the same schema when the root element carries
/// an explicit REQUIRED repetition type.
#[test]
fn test_schema_from_thrift_root_has_repetition() {
    let message_type = "
    message schema {
      OPTIONAL BYTE_ARRAY a (UTF8);
      OPTIONAL INT32 b (UINT_8);
    }
    ";

    let expected = parse_message_type(message_type).unwrap();
    let mut elements = to_thrift(&expected).unwrap();

    // Element 0 is the schema root; force a repetition type onto it.
    let root = &mut elements[0];
    root.repetition_type = Some(Repetition::REQUIRED.into());

    let decoded = from_thrift(&elements).unwrap();
    assert_eq!(decoded, Arc::new(expected));
}
2335
/// `from_thrift` must round-trip a message with no fields at all, even when
/// its root element is given an explicit REQUIRED repetition type.
#[test]
fn test_schema_from_thrift_group_has_no_child() {
    let message_type = "message schema {}";

    let expected = parse_message_type(message_type).unwrap();
    let mut elements = to_thrift(&expected).unwrap();

    // Element 0 is the schema root; force a repetition type onto it.
    let root = &mut elements[0];
    root.repetition_type = Some(Repetition::REQUIRED.into());

    let decoded = from_thrift(&elements).unwrap();
    assert_eq!(decoded, Arc::new(expected));
}
2347}