parquet/schema/
types.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains structs and methods to build Parquet schema and schema descriptors.
19
20use std::{collections::HashMap, fmt, sync::Arc};
21
22use crate::file::metadata::HeapSize;
23use crate::format::SchemaElement;
24
25use crate::basic::{
26    ColumnOrder, ConvertedType, LogicalType, Repetition, SortOrder, TimeUnit, Type as PhysicalType,
27};
28use crate::errors::{ParquetError, Result};
29
30// ----------------------------------------------------------------------
31// Parquet Type definitions
32
/// Shared, reference-counted pointer to a [`Type`] (alias for `Arc<Type>`).
pub type TypePtr = Arc<Type>;
/// Shared, reference-counted pointer to a [`SchemaDescriptor`]
/// (alias for `Arc<SchemaDescriptor>`).
pub type SchemaDescPtr = Arc<SchemaDescriptor>;
/// Shared, reference-counted pointer to a [`ColumnDescriptor`]
/// (alias for `Arc<ColumnDescriptor>`).
pub type ColumnDescPtr = Arc<ColumnDescriptor>;
39
/// Representation of a Parquet type.
///
/// Used to describe primitive leaf fields and structs, including top-level schema.
///
/// Note that the top-level schema is represented using [`Type::GroupType`] whose
/// repetition is `None`.
///
/// Instances are normally created through [`Type::primitive_type_builder`] and
/// [`Type::group_type_builder`].
#[derive(Clone, Debug, PartialEq)]
pub enum Type {
    /// Represents a primitive leaf field.
    PrimitiveType {
        /// Basic information about the type (name, repetition, annotations, id).
        basic_info: BasicTypeInfo,
        /// Physical type of this primitive type.
        physical_type: PhysicalType,
        /// Length of this type; `-1` when [`PrimitiveTypeBuilder`] was not given one.
        type_length: i32,
        /// Scale of this type; `-1` when [`PrimitiveTypeBuilder`] was not given one.
        scale: i32,
        /// Precision of this type; `-1` when [`PrimitiveTypeBuilder`] was not given one.
        precision: i32,
    },
    /// Represents a group of fields (similar to struct).
    GroupType {
        /// Basic information about the type (name, repetition, annotations, id).
        basic_info: BasicTypeInfo,
        /// Child fields of this group type, in declaration order.
        fields: Vec<TypePtr>,
    },
}
69
70impl HeapSize for Type {
71    fn heap_size(&self) -> usize {
72        match self {
73            Type::PrimitiveType { basic_info, .. } => basic_info.heap_size(),
74            Type::GroupType { basic_info, fields } => basic_info.heap_size() + fields.heap_size(),
75        }
76    }
77}
78
79impl Type {
80    /// Creates primitive type builder with provided field name and physical type.
81    pub fn primitive_type_builder(
82        name: &str,
83        physical_type: PhysicalType,
84    ) -> PrimitiveTypeBuilder<'_> {
85        PrimitiveTypeBuilder::new(name, physical_type)
86    }
87
88    /// Creates group type builder with provided column name.
89    pub fn group_type_builder(name: &str) -> GroupTypeBuilder<'_> {
90        GroupTypeBuilder::new(name)
91    }
92
93    /// Returns [`BasicTypeInfo`] information about the type.
94    pub fn get_basic_info(&self) -> &BasicTypeInfo {
95        match *self {
96            Type::PrimitiveType { ref basic_info, .. } => basic_info,
97            Type::GroupType { ref basic_info, .. } => basic_info,
98        }
99    }
100
101    /// Returns this type's field name.
102    pub fn name(&self) -> &str {
103        self.get_basic_info().name()
104    }
105
106    /// Gets the fields from this group type.
107    /// Note that this will panic if called on a non-group type.
108    // TODO: should we return `&[&Type]` here?
109    pub fn get_fields(&self) -> &[TypePtr] {
110        match *self {
111            Type::GroupType { ref fields, .. } => &fields[..],
112            _ => panic!("Cannot call get_fields() on a non-group type"),
113        }
114    }
115
116    /// Gets physical type of this primitive type.
117    /// Note that this will panic if called on a non-primitive type.
118    pub fn get_physical_type(&self) -> PhysicalType {
119        match *self {
120            Type::PrimitiveType {
121                basic_info: _,
122                physical_type,
123                ..
124            } => physical_type,
125            _ => panic!("Cannot call get_physical_type() on a non-primitive type"),
126        }
127    }
128
129    /// Gets precision of this primitive type.
130    /// Note that this will panic if called on a non-primitive type.
131    pub fn get_precision(&self) -> i32 {
132        match *self {
133            Type::PrimitiveType { precision, .. } => precision,
134            _ => panic!("Cannot call get_precision() on non-primitive type"),
135        }
136    }
137
138    /// Gets scale of this primitive type.
139    /// Note that this will panic if called on a non-primitive type.
140    pub fn get_scale(&self) -> i32 {
141        match *self {
142            Type::PrimitiveType { scale, .. } => scale,
143            _ => panic!("Cannot call get_scale() on non-primitive type"),
144        }
145    }
146
147    /// Checks if `sub_type` schema is part of current schema.
148    /// This method can be used to check if projected columns are part of the root schema.
149    pub fn check_contains(&self, sub_type: &Type) -> bool {
150        // Names match, and repetitions match or not set for both
151        let basic_match = self.get_basic_info().name() == sub_type.get_basic_info().name()
152            && (self.is_schema() && sub_type.is_schema()
153                || !self.is_schema()
154                    && !sub_type.is_schema()
155                    && self.get_basic_info().repetition()
156                        == sub_type.get_basic_info().repetition());
157
158        match *self {
159            Type::PrimitiveType { .. } if basic_match && sub_type.is_primitive() => {
160                self.get_physical_type() == sub_type.get_physical_type()
161            }
162            Type::GroupType { .. } if basic_match && sub_type.is_group() => {
163                // build hashmap of name -> TypePtr
164                let mut field_map = HashMap::new();
165                for field in self.get_fields() {
166                    field_map.insert(field.name(), field);
167                }
168
169                for field in sub_type.get_fields() {
170                    if !field_map
171                        .get(field.name())
172                        .map(|tpe| tpe.check_contains(field))
173                        .unwrap_or(false)
174                    {
175                        return false;
176                    }
177                }
178                true
179            }
180            _ => false,
181        }
182    }
183
184    /// Returns `true` if this type is a primitive type, `false` otherwise.
185    pub fn is_primitive(&self) -> bool {
186        matches!(*self, Type::PrimitiveType { .. })
187    }
188
189    /// Returns `true` if this type is a group type, `false` otherwise.
190    pub fn is_group(&self) -> bool {
191        matches!(*self, Type::GroupType { .. })
192    }
193
194    /// Returns `true` if this type is the top-level schema type (message type).
195    pub fn is_schema(&self) -> bool {
196        match *self {
197            Type::GroupType { ref basic_info, .. } => !basic_info.has_repetition(),
198            _ => false,
199        }
200    }
201
202    /// Returns `true` if this type is repeated or optional.
203    /// If this type doesn't have repetition defined, we treat it as required.
204    pub fn is_optional(&self) -> bool {
205        self.get_basic_info().has_repetition()
206            && self.get_basic_info().repetition() != Repetition::REQUIRED
207    }
208
209    /// Returns `true` if this type is annotated as a list.
210    pub(crate) fn is_list(&self) -> bool {
211        if self.is_group() {
212            let basic_info = self.get_basic_info();
213            if let Some(logical_type) = basic_info.logical_type() {
214                return logical_type == LogicalType::List;
215            }
216            return basic_info.converted_type() == ConvertedType::LIST;
217        }
218        false
219    }
220
221    /// Returns `true` if this type is a group with a single child field that is `repeated`.
222    pub(crate) fn has_single_repeated_child(&self) -> bool {
223        if self.is_group() {
224            let children = self.get_fields();
225            return children.len() == 1
226                && children[0].get_basic_info().has_repetition()
227                && children[0].get_basic_info().repetition() == Repetition::REPEATED;
228        }
229        false
230    }
231}
232
/// A builder for primitive types. All attributes are optional
/// except the name and physical type.
/// Note that if not specified explicitly, `Repetition::OPTIONAL` is used.
pub struct PrimitiveTypeBuilder<'a> {
    /// Field name, borrowed from the caller for the lifetime of the builder.
    name: &'a str,
    /// Repetition of the field; `Repetition::OPTIONAL` unless overridden.
    repetition: Repetition,
    /// Physical type of the field.
    physical_type: PhysicalType,
    /// Converted type annotation; `ConvertedType::NONE` unless overridden.
    converted_type: ConvertedType,
    /// Logical type annotation; `None` unless overridden.
    logical_type: Option<LogicalType>,
    /// Type length; `-1` unless overridden.
    length: i32,
    /// Decimal precision; `-1` unless overridden.
    precision: i32,
    /// Decimal scale; `-1` unless overridden.
    scale: i32,
    /// Optional field id; `None` unless overridden.
    id: Option<i32>,
}
247
248impl<'a> PrimitiveTypeBuilder<'a> {
249    /// Creates new primitive type builder with provided field name and physical type.
250    pub fn new(name: &'a str, physical_type: PhysicalType) -> Self {
251        Self {
252            name,
253            repetition: Repetition::OPTIONAL,
254            physical_type,
255            converted_type: ConvertedType::NONE,
256            logical_type: None,
257            length: -1,
258            precision: -1,
259            scale: -1,
260            id: None,
261        }
262    }
263
264    /// Sets [`Repetition`] for this field and returns itself.
265    pub fn with_repetition(self, repetition: Repetition) -> Self {
266        Self { repetition, ..self }
267    }
268
269    /// Sets [`ConvertedType`] for this field and returns itself.
270    pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
271        Self {
272            converted_type,
273            ..self
274        }
275    }
276
277    /// Sets [`LogicalType`] for this field and returns itself.
278    /// If only the logical type is populated for a primitive type, the converted type
279    /// will be automatically populated, and can thus be omitted.
280    pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
281        Self {
282            logical_type,
283            ..self
284        }
285    }
286
287    /// Sets type length and returns itself.
288    /// This is only applied to FIXED_LEN_BYTE_ARRAY and INT96 (INTERVAL) types, because
289    /// they maintain fixed size underlying byte array.
290    /// By default, value is `0`.
291    pub fn with_length(self, length: i32) -> Self {
292        Self { length, ..self }
293    }
294
295    /// Sets precision for Parquet DECIMAL physical type and returns itself.
296    /// By default, it equals to `0` and used only for decimal context.
297    pub fn with_precision(self, precision: i32) -> Self {
298        Self { precision, ..self }
299    }
300
301    /// Sets scale for Parquet DECIMAL physical type and returns itself.
302    /// By default, it equals to `0` and used only for decimal context.
303    pub fn with_scale(self, scale: i32) -> Self {
304        Self { scale, ..self }
305    }
306
307    /// Sets optional field id and returns itself.
308    pub fn with_id(self, id: Option<i32>) -> Self {
309        Self { id, ..self }
310    }
311
312    /// Creates a new `PrimitiveType` instance from the collected attributes.
313    /// Returns `Err` in case of any building conditions are not met.
314    pub fn build(self) -> Result<Type> {
315        let mut basic_info = BasicTypeInfo {
316            name: String::from(self.name),
317            repetition: Some(self.repetition),
318            converted_type: self.converted_type,
319            logical_type: self.logical_type.clone(),
320            id: self.id,
321        };
322
323        // Check length before logical type, since it is used for logical type validation.
324        if self.physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY && self.length < 0 {
325            return Err(general_err!(
326                "Invalid FIXED_LEN_BYTE_ARRAY length: {} for field '{}'",
327                self.length,
328                self.name
329            ));
330        }
331
332        if let Some(logical_type) = &self.logical_type {
333            // If a converted type is populated, check that it is consistent with
334            // its logical type
335            if self.converted_type != ConvertedType::NONE {
336                if ConvertedType::from(self.logical_type.clone()) != self.converted_type {
337                    return Err(general_err!(
338                        "Logical type {:?} is incompatible with converted type {} for field '{}'",
339                        logical_type,
340                        self.converted_type,
341                        self.name
342                    ));
343                }
344            } else {
345                // Populate the converted type for backwards compatibility
346                basic_info.converted_type = self.logical_type.clone().into();
347            }
348            // Check that logical type and physical type are compatible
349            match (logical_type, self.physical_type) {
350                (LogicalType::Map, _) | (LogicalType::List, _) => {
351                    return Err(general_err!(
352                        "{:?} cannot be applied to a primitive type for field '{}'",
353                        logical_type,
354                        self.name
355                    ));
356                }
357                (LogicalType::Enum, PhysicalType::BYTE_ARRAY) => {}
358                (LogicalType::Decimal { scale, precision }, _) => {
359                    // Check that scale and precision are consistent with legacy values
360                    if *scale != self.scale {
361                        return Err(general_err!(
362                            "DECIMAL logical type scale {} must match self.scale {} for field '{}'",
363                            scale,
364                            self.scale,
365                            self.name
366                        ));
367                    }
368                    if *precision != self.precision {
369                        return Err(general_err!(
370                            "DECIMAL logical type precision {} must match self.precision {} for field '{}'",
371                            precision,
372                            self.precision,
373                            self.name
374                        ));
375                    }
376                    self.check_decimal_precision_scale()?;
377                }
378                (LogicalType::Date, PhysicalType::INT32) => {}
379                (
380                    LogicalType::Time {
381                        unit: TimeUnit::MILLIS(_),
382                        ..
383                    },
384                    PhysicalType::INT32,
385                ) => {}
386                (LogicalType::Time { unit, .. }, PhysicalType::INT64) => {
387                    if *unit == TimeUnit::MILLIS(Default::default()) {
388                        return Err(general_err!(
389                            "Cannot use millisecond unit on INT64 type for field '{}'",
390                            self.name
391                        ));
392                    }
393                }
394                (LogicalType::Timestamp { .. }, PhysicalType::INT64) => {}
395                (LogicalType::Integer { bit_width, .. }, PhysicalType::INT32)
396                    if *bit_width <= 32 => {}
397                (LogicalType::Integer { bit_width, .. }, PhysicalType::INT64)
398                    if *bit_width == 64 => {}
399                // Null type
400                (LogicalType::Unknown, PhysicalType::INT32) => {}
401                (LogicalType::String, PhysicalType::BYTE_ARRAY) => {}
402                (LogicalType::Json, PhysicalType::BYTE_ARRAY) => {}
403                (LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {}
404                (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) if self.length == 16 => {}
405                (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
406                    return Err(general_err!(
407                        "UUID cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(16) field",
408                        self.name
409                    ))
410                }
411                (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY)
412                    if self.length == 2 => {}
413                (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
414                    return Err(general_err!(
415                        "FLOAT16 cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(2) field",
416                        self.name
417                    ))
418                }
419                (a, b) => {
420                    return Err(general_err!(
421                        "Cannot annotate {:?} from {} for field '{}'",
422                        a,
423                        b,
424                        self.name
425                    ))
426                }
427            }
428        }
429
430        match self.converted_type {
431            ConvertedType::NONE => {}
432            ConvertedType::UTF8 | ConvertedType::BSON | ConvertedType::JSON => {
433                if self.physical_type != PhysicalType::BYTE_ARRAY {
434                    return Err(general_err!(
435                        "{} cannot annotate field '{}' because it is not a BYTE_ARRAY field",
436                        self.converted_type,
437                        self.name
438                    ));
439                }
440            }
441            ConvertedType::DECIMAL => {
442                self.check_decimal_precision_scale()?;
443            }
444            ConvertedType::DATE
445            | ConvertedType::TIME_MILLIS
446            | ConvertedType::UINT_8
447            | ConvertedType::UINT_16
448            | ConvertedType::UINT_32
449            | ConvertedType::INT_8
450            | ConvertedType::INT_16
451            | ConvertedType::INT_32 => {
452                if self.physical_type != PhysicalType::INT32 {
453                    return Err(general_err!(
454                        "{} cannot annotate field '{}' because it is not a INT32 field",
455                        self.converted_type,
456                        self.name
457                    ));
458                }
459            }
460            ConvertedType::TIME_MICROS
461            | ConvertedType::TIMESTAMP_MILLIS
462            | ConvertedType::TIMESTAMP_MICROS
463            | ConvertedType::UINT_64
464            | ConvertedType::INT_64 => {
465                if self.physical_type != PhysicalType::INT64 {
466                    return Err(general_err!(
467                        "{} cannot annotate field '{}' because it is not a INT64 field",
468                        self.converted_type,
469                        self.name
470                    ));
471                }
472            }
473            ConvertedType::INTERVAL => {
474                if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 {
475                    return Err(general_err!(
476                        "INTERVAL cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(12) field",
477                        self.name
478                    ));
479                }
480            }
481            ConvertedType::ENUM => {
482                if self.physical_type != PhysicalType::BYTE_ARRAY {
483                    return Err(general_err!(
484                        "ENUM cannot annotate field '{}' because it is not a BYTE_ARRAY field",
485                        self.name
486                    ));
487                }
488            }
489            _ => {
490                return Err(general_err!(
491                    "{} cannot be applied to primitive field '{}'",
492                    self.converted_type,
493                    self.name
494                ));
495            }
496        }
497
498        Ok(Type::PrimitiveType {
499            basic_info,
500            physical_type: self.physical_type,
501            type_length: self.length,
502            scale: self.scale,
503            precision: self.precision,
504        })
505    }
506
507    #[inline]
508    fn check_decimal_precision_scale(&self) -> Result<()> {
509        match self.physical_type {
510            PhysicalType::INT32
511            | PhysicalType::INT64
512            | PhysicalType::BYTE_ARRAY
513            | PhysicalType::FIXED_LEN_BYTE_ARRAY => (),
514            _ => {
515                return Err(general_err!(
516                    "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
517                ));
518            }
519        }
520
521        // Precision is required and must be a non-zero positive integer.
522        if self.precision < 1 {
523            return Err(general_err!(
524                "Invalid DECIMAL precision: {}",
525                self.precision
526            ));
527        }
528
529        // Scale must be zero or a positive integer less than the precision.
530        if self.scale < 0 {
531            return Err(general_err!("Invalid DECIMAL scale: {}", self.scale));
532        }
533
534        if self.scale > self.precision {
535            return Err(general_err!(
536                "Invalid DECIMAL: scale ({}) cannot be greater than precision \
537             ({})",
538                self.scale,
539                self.precision
540            ));
541        }
542
543        // Check precision and scale based on physical type limitations.
544        match self.physical_type {
545            PhysicalType::INT32 => {
546                if self.precision > 9 {
547                    return Err(general_err!(
548                        "Cannot represent INT32 as DECIMAL with precision {}",
549                        self.precision
550                    ));
551                }
552            }
553            PhysicalType::INT64 => {
554                if self.precision > 18 {
555                    return Err(general_err!(
556                        "Cannot represent INT64 as DECIMAL with precision {}",
557                        self.precision
558                    ));
559                }
560            }
561            PhysicalType::FIXED_LEN_BYTE_ARRAY => {
562                let length = self
563                    .length
564                    .checked_mul(8)
565                    .ok_or(general_err!("Invalid length {} for Decimal", self.length))?;
566                let max_precision = (2f64.powi(length - 1) - 1f64).log10().floor() as i32;
567
568                if self.precision > max_precision {
569                    return Err(general_err!(
570                        "Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length {} and \
571                        precision {}. The max precision can only be {}",
572                        self.length,
573                        self.precision,
574                        max_precision
575                    ));
576                }
577            }
578            _ => (), // For BYTE_ARRAY precision is not limited
579        }
580
581        Ok(())
582    }
583}
584
/// A builder for group types. All attributes are optional except the name.
/// Note that if not specified explicitly, `None` is used as the repetition of the group,
/// which means it is a root (message) type.
pub struct GroupTypeBuilder<'a> {
    /// Field name, borrowed from the caller for the lifetime of the builder.
    name: &'a str,
    /// Repetition of the group; `None` unless overridden.
    repetition: Option<Repetition>,
    /// Converted type annotation; `ConvertedType::NONE` unless overridden.
    converted_type: ConvertedType,
    /// Logical type annotation; `None` unless overridden.
    logical_type: Option<LogicalType>,
    /// Child fields of the group; empty unless overridden.
    fields: Vec<TypePtr>,
    /// Optional field id; `None` unless overridden.
    id: Option<i32>,
}
596
597impl<'a> GroupTypeBuilder<'a> {
598    /// Creates new group type builder with provided field name.
599    pub fn new(name: &'a str) -> Self {
600        Self {
601            name,
602            repetition: None,
603            converted_type: ConvertedType::NONE,
604            logical_type: None,
605            fields: Vec::new(),
606            id: None,
607        }
608    }
609
610    /// Sets [`Repetition`] for this field and returns itself.
611    pub fn with_repetition(mut self, repetition: Repetition) -> Self {
612        self.repetition = Some(repetition);
613        self
614    }
615
616    /// Sets [`ConvertedType`] for this field and returns itself.
617    pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
618        Self {
619            converted_type,
620            ..self
621        }
622    }
623
624    /// Sets [`LogicalType`] for this field and returns itself.
625    pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
626        Self {
627            logical_type,
628            ..self
629        }
630    }
631
632    /// Sets a list of fields that should be child nodes of this field.
633    /// Returns updated self.
634    pub fn with_fields(self, fields: Vec<TypePtr>) -> Self {
635        Self { fields, ..self }
636    }
637
638    /// Sets optional field id and returns itself.
639    pub fn with_id(self, id: Option<i32>) -> Self {
640        Self { id, ..self }
641    }
642
643    /// Creates a new `GroupType` instance from the gathered attributes.
644    pub fn build(self) -> Result<Type> {
645        let mut basic_info = BasicTypeInfo {
646            name: String::from(self.name),
647            repetition: self.repetition,
648            converted_type: self.converted_type,
649            logical_type: self.logical_type.clone(),
650            id: self.id,
651        };
652        // Populate the converted type if only the logical type is populated
653        if self.logical_type.is_some() && self.converted_type == ConvertedType::NONE {
654            basic_info.converted_type = self.logical_type.into();
655        }
656        Ok(Type::GroupType {
657            basic_info,
658            fields: self.fields,
659        })
660    }
661}
662
/// Basic type info. This contains the information carried by both primitive and
/// group types: the name, the optional repetition, the converted type, the optional
/// logical type and the optional field id.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BasicTypeInfo {
    /// Field name.
    name: String,
    /// Repetition of the field; `None` when no repetition was set.
    repetition: Option<Repetition>,
    /// Converted type annotation.
    converted_type: ConvertedType,
    /// Logical type annotation, if any.
    logical_type: Option<LogicalType>,
    /// Field id, if any.
    id: Option<i32>,
}
673
674impl HeapSize for BasicTypeInfo {
675    fn heap_size(&self) -> usize {
676        // no heap allocations in any other subfield
677        self.name.heap_size()
678    }
679}
680
681impl BasicTypeInfo {
682    /// Returns field name.
683    pub fn name(&self) -> &str {
684        &self.name
685    }
686
687    /// Returns `true` if type has repetition field set, `false` otherwise.
688    /// This is mostly applied to group type, because primitive type always has
689    /// repetition set.
690    pub fn has_repetition(&self) -> bool {
691        self.repetition.is_some()
692    }
693
694    /// Returns [`Repetition`] value for the type.
695    pub fn repetition(&self) -> Repetition {
696        assert!(self.repetition.is_some());
697        self.repetition.unwrap()
698    }
699
700    /// Returns [`ConvertedType`] value for the type.
701    pub fn converted_type(&self) -> ConvertedType {
702        self.converted_type
703    }
704
705    /// Returns [`LogicalType`] value for the type.
706    pub fn logical_type(&self) -> Option<LogicalType> {
707        // Unlike ConvertedType, LogicalType cannot implement Copy, thus we clone it
708        self.logical_type.clone()
709    }
710
711    /// Returns `true` if id is set, `false` otherwise.
712    pub fn has_id(&self) -> bool {
713        self.id.is_some()
714    }
715
716    /// Returns id value for the type.
717    pub fn id(&self) -> i32 {
718        assert!(self.id.is_some());
719        self.id.unwrap()
720    }
721}
722
723// ----------------------------------------------------------------------
724// Parquet descriptor definitions
725
/// Represents the location of a column in a Parquet schema, stored as the list of
/// field names leading to it.
///
/// # Example: refer to column named `'my_column'`
/// ```
/// # use parquet::schema::types::ColumnPath;
/// let column_path = ColumnPath::from("my_column");
/// ```
///
/// # Example: refer to column named `c` in a nested struct `{a: {b: {c: ...}}}`
/// ```
/// # use parquet::schema::types::ColumnPath;
/// // form path 'a.b.c'
/// let column_path = ColumnPath::from(vec![
///   String::from("a"),
///   String::from("b"),
///   String::from("c")
/// ]);
/// ```
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub struct ColumnPath {
    /// Path components, outermost field first.
    parts: Vec<String>,
}
748
749impl HeapSize for ColumnPath {
750    fn heap_size(&self) -> usize {
751        self.parts.heap_size()
752    }
753}
754
755impl ColumnPath {
756    /// Creates new column path from vector of field names.
757    pub fn new(parts: Vec<String>) -> Self {
758        ColumnPath { parts }
759    }
760
761    /// Returns string representation of this column path.
762    /// ```rust
763    /// use parquet::schema::types::ColumnPath;
764    ///
765    /// let path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c".to_string()]);
766    /// assert_eq!(&path.string(), "a.b.c");
767    /// ```
768    pub fn string(&self) -> String {
769        self.parts.join(".")
770    }
771
772    /// Appends more components to end of column path.
773    /// ```rust
774    /// use parquet::schema::types::ColumnPath;
775    ///
776    /// let mut path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c"
777    /// .to_string()]);
778    /// assert_eq!(&path.string(), "a.b.c");
779    ///
780    /// path.append(vec!["d".to_string(), "e".to_string()]);
781    /// assert_eq!(&path.string(), "a.b.c.d.e");
782    /// ```
783    pub fn append(&mut self, mut tail: Vec<String>) {
784        self.parts.append(&mut tail);
785    }
786
787    /// Returns a slice of path components.
788    pub fn parts(&self) -> &[String] {
789        &self.parts
790    }
791}
792
793impl fmt::Display for ColumnPath {
794    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
795        write!(f, "{:?}", self.string())
796    }
797}
798
799impl From<Vec<String>> for ColumnPath {
800    fn from(parts: Vec<String>) -> Self {
801        ColumnPath { parts }
802    }
803}
804
805impl From<&str> for ColumnPath {
806    fn from(single_path: &str) -> Self {
807        let s = String::from(single_path);
808        ColumnPath::from(s)
809    }
810}
811
812impl From<String> for ColumnPath {
813    fn from(single_path: String) -> Self {
814        let v = vec![single_path];
815        ColumnPath { parts: v }
816    }
817}
818
819impl AsRef<[String]> for ColumnPath {
820    fn as_ref(&self) -> &[String] {
821        &self.parts
822    }
823}
824
/// Descriptor of a leaf-level primitive column.
///
/// Holds the leaf's primitive type and its path, together with the maximum
/// definition and repetition levels required to re-assemble nested data.
#[derive(Debug, PartialEq)]
pub struct ColumnDescriptor {
    /// The "leaf" primitive type of this column
    primitive_type: TypePtr,

    /// The maximum definition level for this column
    max_def_level: i16,

    /// The maximum repetition level for this column
    max_rep_level: i16,

    /// The path of this column. For instance, "a.b.c.d".
    path: ColumnPath,
}
843
844impl HeapSize for ColumnDescriptor {
845    fn heap_size(&self) -> usize {
846        self.primitive_type.heap_size() + self.path.heap_size()
847    }
848}
849
850impl ColumnDescriptor {
851    /// Creates new descriptor for leaf-level column.
852    pub fn new(
853        primitive_type: TypePtr,
854        max_def_level: i16,
855        max_rep_level: i16,
856        path: ColumnPath,
857    ) -> Self {
858        Self {
859            primitive_type,
860            max_def_level,
861            max_rep_level,
862            path,
863        }
864    }
865
866    /// Returns maximum definition level for this column.
867    #[inline]
868    pub fn max_def_level(&self) -> i16 {
869        self.max_def_level
870    }
871
872    /// Returns maximum repetition level for this column.
873    #[inline]
874    pub fn max_rep_level(&self) -> i16 {
875        self.max_rep_level
876    }
877
878    /// Returns [`ColumnPath`] for this column.
879    pub fn path(&self) -> &ColumnPath {
880        &self.path
881    }
882
883    /// Returns self type [`Type`] for this leaf column.
884    pub fn self_type(&self) -> &Type {
885        self.primitive_type.as_ref()
886    }
887
888    /// Returns self type [`TypePtr`]  for this leaf
889    /// column.
890    pub fn self_type_ptr(&self) -> TypePtr {
891        self.primitive_type.clone()
892    }
893
894    /// Returns column name.
895    pub fn name(&self) -> &str {
896        self.primitive_type.name()
897    }
898
899    /// Returns [`ConvertedType`] for this column.
900    pub fn converted_type(&self) -> ConvertedType {
901        self.primitive_type.get_basic_info().converted_type()
902    }
903
904    /// Returns [`LogicalType`] for this column.
905    pub fn logical_type(&self) -> Option<LogicalType> {
906        self.primitive_type.get_basic_info().logical_type()
907    }
908
909    /// Returns physical type for this column.
910    /// Note that it will panic if called on a non-primitive type.
911    pub fn physical_type(&self) -> PhysicalType {
912        match self.primitive_type.as_ref() {
913            Type::PrimitiveType { physical_type, .. } => *physical_type,
914            _ => panic!("Expected primitive type!"),
915        }
916    }
917
918    /// Returns type length for this column.
919    /// Note that it will panic if called on a non-primitive type.
920    pub fn type_length(&self) -> i32 {
921        match self.primitive_type.as_ref() {
922            Type::PrimitiveType { type_length, .. } => *type_length,
923            _ => panic!("Expected primitive type!"),
924        }
925    }
926
927    /// Returns type precision for this column.
928    /// Note that it will panic if called on a non-primitive type.
929    pub fn type_precision(&self) -> i32 {
930        match self.primitive_type.as_ref() {
931            Type::PrimitiveType { precision, .. } => *precision,
932            _ => panic!("Expected primitive type!"),
933        }
934    }
935
936    /// Returns type scale for this column.
937    /// Note that it will panic if called on a non-primitive type.
938    pub fn type_scale(&self) -> i32 {
939        match self.primitive_type.as_ref() {
940            Type::PrimitiveType { scale, .. } => *scale,
941            _ => panic!("Expected primitive type!"),
942        }
943    }
944
945    /// Returns the sort order for this column
946    pub fn sort_order(&self) -> SortOrder {
947        ColumnOrder::get_sort_order(
948            self.logical_type(),
949            self.converted_type(),
950            self.physical_type(),
951        )
952    }
953}
954
/// Schema of a Parquet file.
///
/// Encapsulates the file's schema ([`Type`]) and [`ColumnDescriptor`]s for
/// each primitive (leaf) column.
///
/// The leaf descriptors and the leaf-to-root mapping are derived from the
/// schema once, when the descriptor is created.
///
/// # Example
/// ```
/// # use std::sync::Arc;
/// use parquet::schema::types::{SchemaDescriptor, Type};
/// use parquet::basic; // note there are two `Type`s that are different
/// // Schema for a table with two columns: "a" (int64) and "b" (int32, stored as a date)
/// let descriptor = SchemaDescriptor::new(
///   Arc::new(
///     Type::group_type_builder("my_schema")
///       .with_fields(vec![
///         Arc::new(
///          Type::primitive_type_builder("a", basic::Type::INT64)
///           .build().unwrap()
///         ),
///         Arc::new(
///          Type::primitive_type_builder("b", basic::Type::INT32)
///           .with_converted_type(basic::ConvertedType::DATE)
///           .with_logical_type(Some(basic::LogicalType::Date))
///           .build().unwrap()
///         ),
///      ])
///      .build().unwrap()
///   )
/// );
/// ```
#[derive(PartialEq)]
pub struct SchemaDescriptor {
    /// The top-level logical schema (the "message" type).
    ///
    /// This must be a [`Type::GroupType`] where each field is a root
    /// column type in the schema.
    schema: TypePtr,

    /// The descriptors for the physical type of each leaf column in this schema
    ///
    /// Constructed from `schema` in DFS order.
    leaves: Vec<ColumnDescPtr>,

    /// Mapping from a leaf column's index to the root column index that it
    /// comes from.
    ///
    /// Entry `i` is the position, within the fields of `schema`, of the root
    /// column that contains leaf `i` of `leaves`.
    ///
    /// For instance: the leaf `a.b.c.d` would have a link back to `a`:
    /// ```text
    /// -- a  <-----+
    /// -- -- b     |
    /// -- -- -- c  |
    /// -- -- -- -- d
    /// ```
    leaf_to_base: Vec<usize>,
}
1010
1011impl fmt::Debug for SchemaDescriptor {
1012    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1013        // Skip leaves and leaf_to_base as they only a cache information already found in `schema`
1014        f.debug_struct("SchemaDescriptor")
1015            .field("schema", &self.schema)
1016            .finish()
1017    }
1018}
1019
1020// Need to implement HeapSize in this module as the fields are private
1021impl HeapSize for SchemaDescriptor {
1022    fn heap_size(&self) -> usize {
1023        self.schema.heap_size() + self.leaves.heap_size() + self.leaf_to_base.heap_size()
1024    }
1025}
1026
1027impl SchemaDescriptor {
1028    /// Creates new schema descriptor from Parquet schema.
1029    pub fn new(tp: TypePtr) -> Self {
1030        assert!(tp.is_group(), "SchemaDescriptor should take a GroupType");
1031        let mut leaves = vec![];
1032        let mut leaf_to_base = Vec::new();
1033        for (root_idx, f) in tp.get_fields().iter().enumerate() {
1034            let mut path = vec![];
1035            build_tree(f, root_idx, 0, 0, &mut leaves, &mut leaf_to_base, &mut path);
1036        }
1037
1038        Self {
1039            schema: tp,
1040            leaves,
1041            leaf_to_base,
1042        }
1043    }
1044
1045    /// Returns [`ColumnDescriptor`] for a field position.
1046    pub fn column(&self, i: usize) -> ColumnDescPtr {
1047        assert!(
1048            i < self.leaves.len(),
1049            "Index out of bound: {} not in [0, {})",
1050            i,
1051            self.leaves.len()
1052        );
1053        self.leaves[i].clone()
1054    }
1055
1056    /// Returns slice of [`ColumnDescriptor`].
1057    pub fn columns(&self) -> &[ColumnDescPtr] {
1058        &self.leaves
1059    }
1060
1061    /// Returns number of leaf-level columns.
1062    pub fn num_columns(&self) -> usize {
1063        self.leaves.len()
1064    }
1065
1066    /// Returns column root [`Type`] for a leaf position.
1067    pub fn get_column_root(&self, i: usize) -> &Type {
1068        let result = self.column_root_of(i);
1069        result.as_ref()
1070    }
1071
1072    /// Returns column root [`Type`] pointer for a leaf position.
1073    pub fn get_column_root_ptr(&self, i: usize) -> TypePtr {
1074        let result = self.column_root_of(i);
1075        result.clone()
1076    }
1077
1078    /// Returns the index of the root column for a field position
1079    pub fn get_column_root_idx(&self, leaf: usize) -> usize {
1080        assert!(
1081            leaf < self.leaves.len(),
1082            "Index out of bound: {} not in [0, {})",
1083            leaf,
1084            self.leaves.len()
1085        );
1086
1087        *self
1088            .leaf_to_base
1089            .get(leaf)
1090            .unwrap_or_else(|| panic!("Expected a value for index {leaf} but found None"))
1091    }
1092
1093    fn column_root_of(&self, i: usize) -> &TypePtr {
1094        &self.schema.get_fields()[self.get_column_root_idx(i)]
1095    }
1096
1097    /// Returns schema as [`Type`].
1098    pub fn root_schema(&self) -> &Type {
1099        self.schema.as_ref()
1100    }
1101
1102    /// Returns schema as [`TypePtr`] for cheap cloning.
1103    pub fn root_schema_ptr(&self) -> TypePtr {
1104        self.schema.clone()
1105    }
1106
1107    /// Returns schema name.
1108    pub fn name(&self) -> &str {
1109        self.schema.name()
1110    }
1111}
1112
1113fn build_tree<'a>(
1114    tp: &'a TypePtr,
1115    root_idx: usize,
1116    mut max_rep_level: i16,
1117    mut max_def_level: i16,
1118    leaves: &mut Vec<ColumnDescPtr>,
1119    leaf_to_base: &mut Vec<usize>,
1120    path_so_far: &mut Vec<&'a str>,
1121) {
1122    assert!(tp.get_basic_info().has_repetition());
1123
1124    path_so_far.push(tp.name());
1125    match tp.get_basic_info().repetition() {
1126        Repetition::OPTIONAL => {
1127            max_def_level += 1;
1128        }
1129        Repetition::REPEATED => {
1130            max_def_level += 1;
1131            max_rep_level += 1;
1132        }
1133        _ => {}
1134    }
1135
1136    match tp.as_ref() {
1137        Type::PrimitiveType { .. } => {
1138            let mut path: Vec<String> = vec![];
1139            path.extend(path_so_far.iter().copied().map(String::from));
1140            leaves.push(Arc::new(ColumnDescriptor::new(
1141                tp.clone(),
1142                max_def_level,
1143                max_rep_level,
1144                ColumnPath::new(path),
1145            )));
1146            leaf_to_base.push(root_idx);
1147        }
1148        Type::GroupType { ref fields, .. } => {
1149            for f in fields {
1150                build_tree(
1151                    f,
1152                    root_idx,
1153                    max_rep_level,
1154                    max_def_level,
1155                    leaves,
1156                    leaf_to_base,
1157                    path_so_far,
1158                );
1159                path_so_far.pop();
1160            }
1161        }
1162    }
1163}
1164
1165/// Method to convert from Thrift.
1166pub fn from_thrift(elements: &[SchemaElement]) -> Result<TypePtr> {
1167    let mut index = 0;
1168    let mut schema_nodes = Vec::new();
1169    while index < elements.len() {
1170        let t = from_thrift_helper(elements, index)?;
1171        index = t.0;
1172        schema_nodes.push(t.1);
1173    }
1174    if schema_nodes.len() != 1 {
1175        return Err(general_err!(
1176            "Expected exactly one root node, but found {}",
1177            schema_nodes.len()
1178        ));
1179    }
1180
1181    if !schema_nodes[0].is_group() {
1182        return Err(general_err!("Expected root node to be a group type"));
1183    }
1184
1185    Ok(schema_nodes.remove(0))
1186}
1187
1188/// Checks if the logical type is valid.
1189fn check_logical_type(logical_type: &Option<LogicalType>) -> Result<()> {
1190    if let Some(LogicalType::Integer { bit_width, .. }) = *logical_type {
1191        if bit_width != 8 && bit_width != 16 && bit_width != 32 && bit_width != 64 {
1192            return Err(general_err!(
1193                "Bit width must be 8, 16, 32, or 64 for Integer logical type"
1194            ));
1195        }
1196    }
1197    Ok(())
1198}
1199
1200/// Constructs a new Type from the `elements`, starting at index `index`.
1201/// The first result is the starting index for the next Type after this one. If it is
1202/// equal to `elements.len()`, then this Type is the last one.
1203/// The second result is the result Type.
1204fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize, TypePtr)> {
1205    // Whether or not the current node is root (message type).
1206    // There is only one message type node in the schema tree.
1207    let is_root_node = index == 0;
1208
1209    if index >= elements.len() {
1210        return Err(general_err!(
1211            "Index out of bound, index = {}, len = {}",
1212            index,
1213            elements.len()
1214        ));
1215    }
1216    let element = &elements[index];
1217
1218    // Check for empty schema
1219    if let (true, None | Some(0)) = (is_root_node, element.num_children) {
1220        let builder = Type::group_type_builder(&element.name);
1221        return Ok((index + 1, Arc::new(builder.build().unwrap())));
1222    }
1223
1224    let converted_type = ConvertedType::try_from(element.converted_type)?;
1225    // LogicalType is only present in v2 Parquet files. ConvertedType is always
1226    // populated, regardless of the version of the file (v1 or v2).
1227    let logical_type = element
1228        .logical_type
1229        .as_ref()
1230        .map(|value| LogicalType::from(value.clone()));
1231
1232    check_logical_type(&logical_type)?;
1233
1234    let field_id = elements[index].field_id;
1235    match elements[index].num_children {
1236        // From parquet-format:
1237        //   The children count is used to construct the nested relationship.
1238        //   This field is not set when the element is a primitive type
1239        // Sometimes parquet-cpp sets num_children field to 0 for primitive types, so we
1240        // have to handle this case too.
1241        None | Some(0) => {
1242            // primitive type
1243            if elements[index].repetition_type.is_none() {
1244                return Err(general_err!(
1245                    "Repetition level must be defined for a primitive type"
1246                ));
1247            }
1248            let repetition = Repetition::try_from(elements[index].repetition_type.unwrap())?;
1249            if let Some(type_) = elements[index].type_ {
1250                let physical_type = PhysicalType::try_from(type_)?;
1251                let length = elements[index].type_length.unwrap_or(-1);
1252                let scale = elements[index].scale.unwrap_or(-1);
1253                let precision = elements[index].precision.unwrap_or(-1);
1254                let name = &elements[index].name;
1255                let builder = Type::primitive_type_builder(name, physical_type)
1256                    .with_repetition(repetition)
1257                    .with_converted_type(converted_type)
1258                    .with_logical_type(logical_type)
1259                    .with_length(length)
1260                    .with_precision(precision)
1261                    .with_scale(scale)
1262                    .with_id(field_id);
1263                Ok((index + 1, Arc::new(builder.build()?)))
1264            } else {
1265                let mut builder = Type::group_type_builder(&elements[index].name)
1266                    .with_converted_type(converted_type)
1267                    .with_logical_type(logical_type)
1268                    .with_id(field_id);
1269                if !is_root_node {
1270                    // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or
1271                    // REPEATED for root node.
1272                    //
1273                    // We only set repetition for group types that are not top-level message
1274                    // type. According to parquet-format:
1275                    //   Root of the schema does not have a repetition_type.
1276                    //   All other types must have one.
1277                    builder = builder.with_repetition(repetition);
1278                }
1279                Ok((index + 1, Arc::new(builder.build().unwrap())))
1280            }
1281        }
1282        Some(n) => {
1283            let repetition = elements[index]
1284                .repetition_type
1285                .map(Repetition::try_from)
1286                .transpose()?;
1287
1288            let mut fields = vec![];
1289            let mut next_index = index + 1;
1290            for _ in 0..n {
1291                let child_result = from_thrift_helper(elements, next_index)?;
1292                next_index = child_result.0;
1293                fields.push(child_result.1);
1294            }
1295
1296            let mut builder = Type::group_type_builder(&elements[index].name)
1297                .with_converted_type(converted_type)
1298                .with_logical_type(logical_type)
1299                .with_fields(fields)
1300                .with_id(field_id);
1301            if let Some(rep) = repetition {
1302                // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or
1303                // REPEATED for root node.
1304                //
1305                // We only set repetition for group types that are not top-level message
1306                // type. According to parquet-format:
1307                //   Root of the schema does not have a repetition_type.
1308                //   All other types must have one.
1309                if !is_root_node {
1310                    builder = builder.with_repetition(rep);
1311                }
1312            }
1313            Ok((next_index, Arc::new(builder.build().unwrap())))
1314        }
1315    }
1316}
1317
1318/// Method to convert to Thrift.
1319pub fn to_thrift(schema: &Type) -> Result<Vec<SchemaElement>> {
1320    if !schema.is_group() {
1321        return Err(general_err!("Root schema must be Group type"));
1322    }
1323    let mut elements: Vec<SchemaElement> = Vec::new();
1324    to_thrift_helper(schema, &mut elements);
1325    Ok(elements)
1326}
1327
1328/// Constructs list of `SchemaElement` from the schema using depth-first traversal.
1329/// Here we assume that schema is always valid and starts with group type.
1330fn to_thrift_helper(schema: &Type, elements: &mut Vec<SchemaElement>) {
1331    match *schema {
1332        Type::PrimitiveType {
1333            ref basic_info,
1334            physical_type,
1335            type_length,
1336            scale,
1337            precision,
1338        } => {
1339            let element = SchemaElement {
1340                type_: Some(physical_type.into()),
1341                type_length: if type_length >= 0 {
1342                    Some(type_length)
1343                } else {
1344                    None
1345                },
1346                repetition_type: Some(basic_info.repetition().into()),
1347                name: basic_info.name().to_owned(),
1348                num_children: None,
1349                converted_type: basic_info.converted_type().into(),
1350                scale: if scale >= 0 { Some(scale) } else { None },
1351                precision: if precision >= 0 {
1352                    Some(precision)
1353                } else {
1354                    None
1355                },
1356                field_id: if basic_info.has_id() {
1357                    Some(basic_info.id())
1358                } else {
1359                    None
1360                },
1361                logical_type: basic_info.logical_type().map(|value| value.into()),
1362            };
1363
1364            elements.push(element);
1365        }
1366        Type::GroupType {
1367            ref basic_info,
1368            ref fields,
1369        } => {
1370            let repetition = if basic_info.has_repetition() {
1371                Some(basic_info.repetition().into())
1372            } else {
1373                None
1374            };
1375
1376            let element = SchemaElement {
1377                type_: None,
1378                type_length: None,
1379                repetition_type: repetition,
1380                name: basic_info.name().to_owned(),
1381                num_children: Some(fields.len() as i32),
1382                converted_type: basic_info.converted_type().into(),
1383                scale: None,
1384                precision: None,
1385                field_id: if basic_info.has_id() {
1386                    Some(basic_info.id())
1387                } else {
1388                    None
1389                },
1390                logical_type: basic_info.logical_type().map(|value| value.into()),
1391            };
1392
1393            elements.push(element);
1394
1395            // Add child elements for a group
1396            for field in fields {
1397                to_thrift_helper(field, elements);
1398            }
1399        }
1400    }
1401}
1402
1403#[cfg(test)]
1404mod tests {
1405    use super::*;
1406
1407    use crate::schema::parser::parse_message_type;
1408
1409    // TODO: add tests for v2 types
1410
1411    #[test]
1412    fn test_primitive_type() {
1413        let mut result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1414            .with_logical_type(Some(LogicalType::Integer {
1415                bit_width: 32,
1416                is_signed: true,
1417            }))
1418            .with_id(Some(0))
1419            .build();
1420        assert!(result.is_ok());
1421
1422        if let Ok(tp) = result {
1423            assert!(tp.is_primitive());
1424            assert!(!tp.is_group());
1425            let basic_info = tp.get_basic_info();
1426            assert_eq!(basic_info.repetition(), Repetition::OPTIONAL);
1427            assert_eq!(
1428                basic_info.logical_type(),
1429                Some(LogicalType::Integer {
1430                    bit_width: 32,
1431                    is_signed: true
1432                })
1433            );
1434            assert_eq!(basic_info.converted_type(), ConvertedType::INT_32);
1435            assert_eq!(basic_info.id(), 0);
1436            match tp {
1437                Type::PrimitiveType { physical_type, .. } => {
1438                    assert_eq!(physical_type, PhysicalType::INT32);
1439                }
1440                _ => panic!(),
1441            }
1442        }
1443
1444        // Test illegal inputs with logical type
1445        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1446            .with_repetition(Repetition::REPEATED)
1447            .with_logical_type(Some(LogicalType::Integer {
1448                is_signed: true,
1449                bit_width: 8,
1450            }))
1451            .build();
1452        assert!(result.is_err());
1453        if let Err(e) = result {
1454            assert_eq!(
1455                format!("{e}"),
1456                "Parquet error: Cannot annotate Integer { bit_width: 8, is_signed: true } from INT64 for field 'foo'"
1457            );
1458        }
1459
1460        // Test illegal inputs with converted type
1461        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1462            .with_repetition(Repetition::REPEATED)
1463            .with_converted_type(ConvertedType::BSON)
1464            .build();
1465        assert!(result.is_err());
1466        if let Err(e) = result {
1467            assert_eq!(
1468                format!("{e}"),
1469                "Parquet error: BSON cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1470            );
1471        }
1472
1473        result = Type::primitive_type_builder("foo", PhysicalType::INT96)
1474            .with_repetition(Repetition::REQUIRED)
1475            .with_converted_type(ConvertedType::DECIMAL)
1476            .with_precision(-1)
1477            .with_scale(-1)
1478            .build();
1479        assert!(result.is_err());
1480        if let Err(e) = result {
1481            assert_eq!(
1482                format!("{e}"),
1483                "Parquet error: DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
1484            );
1485        }
1486
1487        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1488            .with_repetition(Repetition::REQUIRED)
1489            .with_logical_type(Some(LogicalType::Decimal {
1490                scale: 32,
1491                precision: 12,
1492            }))
1493            .with_precision(-1)
1494            .with_scale(-1)
1495            .build();
1496        assert!(result.is_err());
1497        if let Err(e) = result {
1498            assert_eq!(
1499                format!("{e}"),
1500                "Parquet error: DECIMAL logical type scale 32 must match self.scale -1 for field 'foo'"
1501            );
1502        }
1503
1504        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1505            .with_repetition(Repetition::REQUIRED)
1506            .with_converted_type(ConvertedType::DECIMAL)
1507            .with_precision(-1)
1508            .with_scale(-1)
1509            .build();
1510        assert!(result.is_err());
1511        if let Err(e) = result {
1512            assert_eq!(
1513                format!("{e}"),
1514                "Parquet error: Invalid DECIMAL precision: -1"
1515            );
1516        }
1517
1518        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1519            .with_repetition(Repetition::REQUIRED)
1520            .with_converted_type(ConvertedType::DECIMAL)
1521            .with_precision(0)
1522            .with_scale(-1)
1523            .build();
1524        assert!(result.is_err());
1525        if let Err(e) = result {
1526            assert_eq!(
1527                format!("{e}"),
1528                "Parquet error: Invalid DECIMAL precision: 0"
1529            );
1530        }
1531
1532        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1533            .with_repetition(Repetition::REQUIRED)
1534            .with_converted_type(ConvertedType::DECIMAL)
1535            .with_precision(1)
1536            .with_scale(-1)
1537            .build();
1538        assert!(result.is_err());
1539        if let Err(e) = result {
1540            assert_eq!(format!("{e}"), "Parquet error: Invalid DECIMAL scale: -1");
1541        }
1542
1543        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1544            .with_repetition(Repetition::REQUIRED)
1545            .with_converted_type(ConvertedType::DECIMAL)
1546            .with_precision(1)
1547            .with_scale(2)
1548            .build();
1549        assert!(result.is_err());
1550        if let Err(e) = result {
1551            assert_eq!(
1552                format!("{e}"),
1553                "Parquet error: Invalid DECIMAL: scale (2) cannot be greater than precision (1)"
1554            );
1555        }
1556
1557        // It is OK if precision == scale
1558        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1559            .with_repetition(Repetition::REQUIRED)
1560            .with_converted_type(ConvertedType::DECIMAL)
1561            .with_precision(1)
1562            .with_scale(1)
1563            .build();
1564        assert!(result.is_ok());
1565
1566        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1567            .with_repetition(Repetition::REQUIRED)
1568            .with_converted_type(ConvertedType::DECIMAL)
1569            .with_precision(18)
1570            .with_scale(2)
1571            .build();
1572        assert!(result.is_err());
1573        if let Err(e) = result {
1574            assert_eq!(
1575                format!("{e}"),
1576                "Parquet error: Cannot represent INT32 as DECIMAL with precision 18"
1577            );
1578        }
1579
1580        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1581            .with_repetition(Repetition::REQUIRED)
1582            .with_converted_type(ConvertedType::DECIMAL)
1583            .with_precision(32)
1584            .with_scale(2)
1585            .build();
1586        assert!(result.is_err());
1587        if let Err(e) = result {
1588            assert_eq!(
1589                format!("{e}"),
1590                "Parquet error: Cannot represent INT64 as DECIMAL with precision 32"
1591            );
1592        }
1593
1594        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1595            .with_repetition(Repetition::REQUIRED)
1596            .with_converted_type(ConvertedType::DECIMAL)
1597            .with_length(5)
1598            .with_precision(12)
1599            .with_scale(2)
1600            .build();
1601        assert!(result.is_err());
1602        if let Err(e) = result {
1603            assert_eq!(
1604                format!("{e}"),
1605                "Parquet error: Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length 5 and precision 12. The max precision can only be 11"
1606            );
1607        }
1608
1609        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1610            .with_repetition(Repetition::REQUIRED)
1611            .with_converted_type(ConvertedType::UINT_8)
1612            .build();
1613        assert!(result.is_err());
1614        if let Err(e) = result {
1615            assert_eq!(
1616                format!("{e}"),
1617                "Parquet error: UINT_8 cannot annotate field 'foo' because it is not a INT32 field"
1618            );
1619        }
1620
1621        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1622            .with_repetition(Repetition::REQUIRED)
1623            .with_converted_type(ConvertedType::TIME_MICROS)
1624            .build();
1625        assert!(result.is_err());
1626        if let Err(e) = result {
1627            assert_eq!(
1628                format!("{e}"),
1629                "Parquet error: TIME_MICROS cannot annotate field 'foo' because it is not a INT64 field"
1630            );
1631        }
1632
1633        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1634            .with_repetition(Repetition::REQUIRED)
1635            .with_converted_type(ConvertedType::INTERVAL)
1636            .build();
1637        assert!(result.is_err());
1638        if let Err(e) = result {
1639            assert_eq!(
1640                format!("{e}"),
1641                "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1642            );
1643        }
1644
1645        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1646            .with_repetition(Repetition::REQUIRED)
1647            .with_converted_type(ConvertedType::INTERVAL)
1648            .with_length(1)
1649            .build();
1650        assert!(result.is_err());
1651        if let Err(e) = result {
1652            assert_eq!(
1653                format!("{e}"),
1654                "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1655            );
1656        }
1657
1658        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1659            .with_repetition(Repetition::REQUIRED)
1660            .with_converted_type(ConvertedType::ENUM)
1661            .build();
1662        assert!(result.is_err());
1663        if let Err(e) = result {
1664            assert_eq!(
1665                format!("{e}"),
1666                "Parquet error: ENUM cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1667            );
1668        }
1669
1670        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1671            .with_repetition(Repetition::REQUIRED)
1672            .with_converted_type(ConvertedType::MAP)
1673            .build();
1674        assert!(result.is_err());
1675        if let Err(e) = result {
1676            assert_eq!(
1677                format!("{e}"),
1678                "Parquet error: MAP cannot be applied to primitive field 'foo'"
1679            );
1680        }
1681
1682        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1683            .with_repetition(Repetition::REQUIRED)
1684            .with_converted_type(ConvertedType::DECIMAL)
1685            .with_length(-1)
1686            .build();
1687        assert!(result.is_err());
1688        if let Err(e) = result {
1689            assert_eq!(
1690                format!("{e}"),
1691                "Parquet error: Invalid FIXED_LEN_BYTE_ARRAY length: -1 for field 'foo'"
1692            );
1693        }
1694
1695        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1696            .with_repetition(Repetition::REQUIRED)
1697            .with_logical_type(Some(LogicalType::Float16))
1698            .with_length(2)
1699            .build();
1700        assert!(result.is_ok());
1701
1702        // Can't be other than FIXED_LEN_BYTE_ARRAY for physical type
1703        result = Type::primitive_type_builder("foo", PhysicalType::FLOAT)
1704            .with_repetition(Repetition::REQUIRED)
1705            .with_logical_type(Some(LogicalType::Float16))
1706            .with_length(2)
1707            .build();
1708        assert!(result.is_err());
1709        if let Err(e) = result {
1710            assert_eq!(
1711                format!("{e}"),
1712                "Parquet error: Cannot annotate Float16 from FLOAT for field 'foo'"
1713            );
1714        }
1715
1716        // Must have length 2
1717        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1718            .with_repetition(Repetition::REQUIRED)
1719            .with_logical_type(Some(LogicalType::Float16))
1720            .with_length(4)
1721            .build();
1722        assert!(result.is_err());
1723        if let Err(e) = result {
1724            assert_eq!(
1725                format!("{e}"),
1726                "Parquet error: FLOAT16 cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(2) field"
1727            );
1728        }
1729
1730        // Must have length 16
1731        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1732            .with_repetition(Repetition::REQUIRED)
1733            .with_logical_type(Some(LogicalType::Uuid))
1734            .with_length(15)
1735            .build();
1736        assert!(result.is_err());
1737        if let Err(e) = result {
1738            assert_eq!(
1739                format!("{e}"),
1740                "Parquet error: UUID cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(16) field"
1741            );
1742        }
1743    }
1744
1745    #[test]
1746    fn test_group_type() {
1747        let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32)
1748            .with_converted_type(ConvertedType::INT_32)
1749            .with_id(Some(0))
1750            .build();
1751        assert!(f1.is_ok());
1752        let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY)
1753            .with_converted_type(ConvertedType::UTF8)
1754            .with_id(Some(1))
1755            .build();
1756        assert!(f2.is_ok());
1757
1758        let fields = vec![Arc::new(f1.unwrap()), Arc::new(f2.unwrap())];
1759
1760        let result = Type::group_type_builder("foo")
1761            .with_repetition(Repetition::REPEATED)
1762            .with_logical_type(Some(LogicalType::List))
1763            .with_fields(fields)
1764            .with_id(Some(1))
1765            .build();
1766        assert!(result.is_ok());
1767
1768        let tp = result.unwrap();
1769        let basic_info = tp.get_basic_info();
1770        assert!(tp.is_group());
1771        assert!(!tp.is_primitive());
1772        assert_eq!(basic_info.repetition(), Repetition::REPEATED);
1773        assert_eq!(basic_info.logical_type(), Some(LogicalType::List));
1774        assert_eq!(basic_info.converted_type(), ConvertedType::LIST);
1775        assert_eq!(basic_info.id(), 1);
1776        assert_eq!(tp.get_fields().len(), 2);
1777        assert_eq!(tp.get_fields()[0].name(), "f1");
1778        assert_eq!(tp.get_fields()[1].name(), "f2");
1779    }
1780
1781    #[test]
1782    fn test_column_descriptor() {
1783        let result = test_column_descriptor_helper();
1784        assert!(
1785            result.is_ok(),
1786            "Expected result to be OK but got err:\n {}",
1787            result.unwrap_err()
1788        );
1789    }
1790
1791    fn test_column_descriptor_helper() -> Result<()> {
1792        let tp = Type::primitive_type_builder("name", PhysicalType::BYTE_ARRAY)
1793            .with_converted_type(ConvertedType::UTF8)
1794            .build()?;
1795
1796        let descr = ColumnDescriptor::new(Arc::new(tp), 4, 1, ColumnPath::from("name"));
1797
1798        assert_eq!(descr.path(), &ColumnPath::from("name"));
1799        assert_eq!(descr.converted_type(), ConvertedType::UTF8);
1800        assert_eq!(descr.physical_type(), PhysicalType::BYTE_ARRAY);
1801        assert_eq!(descr.max_def_level(), 4);
1802        assert_eq!(descr.max_rep_level(), 1);
1803        assert_eq!(descr.name(), "name");
1804        assert_eq!(descr.type_length(), -1);
1805        assert_eq!(descr.type_precision(), -1);
1806        assert_eq!(descr.type_scale(), -1);
1807
1808        Ok(())
1809    }
1810
1811    #[test]
1812    fn test_schema_descriptor() {
1813        let result = test_schema_descriptor_helper();
1814        assert!(
1815            result.is_ok(),
1816            "Expected result to be OK but got err:\n {}",
1817            result.unwrap_err()
1818        );
1819    }
1820
1821    // A helper fn to avoid handling the results from type creation
1822    fn test_schema_descriptor_helper() -> Result<()> {
1823        let mut fields = vec![];
1824
1825        let inta = Type::primitive_type_builder("a", PhysicalType::INT32)
1826            .with_repetition(Repetition::REQUIRED)
1827            .with_converted_type(ConvertedType::INT_32)
1828            .build()?;
1829        fields.push(Arc::new(inta));
1830        let intb = Type::primitive_type_builder("b", PhysicalType::INT64)
1831            .with_converted_type(ConvertedType::INT_64)
1832            .build()?;
1833        fields.push(Arc::new(intb));
1834        let intc = Type::primitive_type_builder("c", PhysicalType::BYTE_ARRAY)
1835            .with_repetition(Repetition::REPEATED)
1836            .with_converted_type(ConvertedType::UTF8)
1837            .build()?;
1838        fields.push(Arc::new(intc));
1839
1840        // 3-level list encoding
1841        let item1 = Type::primitive_type_builder("item1", PhysicalType::INT64)
1842            .with_repetition(Repetition::REQUIRED)
1843            .with_converted_type(ConvertedType::INT_64)
1844            .build()?;
1845        let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?;
1846        let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32)
1847            .with_repetition(Repetition::REPEATED)
1848            .with_converted_type(ConvertedType::INT_32)
1849            .build()?;
1850        let list = Type::group_type_builder("records")
1851            .with_repetition(Repetition::REPEATED)
1852            .with_converted_type(ConvertedType::LIST)
1853            .with_fields(vec![Arc::new(item1), Arc::new(item2), Arc::new(item3)])
1854            .build()?;
1855        let bag = Type::group_type_builder("bag")
1856            .with_repetition(Repetition::OPTIONAL)
1857            .with_fields(vec![Arc::new(list)])
1858            .build()?;
1859        fields.push(Arc::new(bag));
1860
1861        let schema = Type::group_type_builder("schema")
1862            .with_repetition(Repetition::REPEATED)
1863            .with_fields(fields)
1864            .build()?;
1865        let descr = SchemaDescriptor::new(Arc::new(schema));
1866
1867        let nleaves = 6;
1868        assert_eq!(descr.num_columns(), nleaves);
1869
1870        //                             mdef mrep
1871        // required int32 a            0    0
1872        // optional int64 b            1    0
1873        // repeated byte_array c       1    1
1874        // optional group bag          1    0
1875        //   repeated group records    2    1
1876        //     required int64 item1    2    1
1877        //     optional boolean item2  3    1
1878        //     repeated int32 item3    3    2
1879        let ex_max_def_levels = [0, 1, 1, 2, 3, 3];
1880        let ex_max_rep_levels = [0, 0, 1, 1, 1, 2];
1881
1882        for i in 0..nleaves {
1883            let col = descr.column(i);
1884            assert_eq!(col.max_def_level(), ex_max_def_levels[i], "{i}");
1885            assert_eq!(col.max_rep_level(), ex_max_rep_levels[i], "{i}");
1886        }
1887
1888        assert_eq!(descr.column(0).path().string(), "a");
1889        assert_eq!(descr.column(1).path().string(), "b");
1890        assert_eq!(descr.column(2).path().string(), "c");
1891        assert_eq!(descr.column(3).path().string(), "bag.records.item1");
1892        assert_eq!(descr.column(4).path().string(), "bag.records.item2");
1893        assert_eq!(descr.column(5).path().string(), "bag.records.item3");
1894
1895        assert_eq!(descr.get_column_root(0).name(), "a");
1896        assert_eq!(descr.get_column_root(3).name(), "bag");
1897        assert_eq!(descr.get_column_root(4).name(), "bag");
1898        assert_eq!(descr.get_column_root(5).name(), "bag");
1899
1900        Ok(())
1901    }
1902
1903    #[test]
1904    fn test_schema_build_tree_def_rep_levels() {
1905        let message_type = "
1906    message spark_schema {
1907      REQUIRED INT32 a;
1908      OPTIONAL group b {
1909        OPTIONAL INT32 _1;
1910        OPTIONAL INT32 _2;
1911      }
1912      OPTIONAL group c (LIST) {
1913        REPEATED group list {
1914          OPTIONAL INT32 element;
1915        }
1916      }
1917    }
1918    ";
1919        let schema = parse_message_type(message_type).expect("should parse schema");
1920        let descr = SchemaDescriptor::new(Arc::new(schema));
1921        // required int32 a
1922        assert_eq!(descr.column(0).max_def_level(), 0);
1923        assert_eq!(descr.column(0).max_rep_level(), 0);
1924        // optional int32 b._1
1925        assert_eq!(descr.column(1).max_def_level(), 2);
1926        assert_eq!(descr.column(1).max_rep_level(), 0);
1927        // optional int32 b._2
1928        assert_eq!(descr.column(2).max_def_level(), 2);
1929        assert_eq!(descr.column(2).max_rep_level(), 0);
1930        // repeated optional int32 c.list.element
1931        assert_eq!(descr.column(3).max_def_level(), 3);
1932        assert_eq!(descr.column(3).max_rep_level(), 1);
1933    }
1934
1935    #[test]
1936    #[should_panic(expected = "Cannot call get_physical_type() on a non-primitive type")]
1937    fn test_get_physical_type_panic() {
1938        let list = Type::group_type_builder("records")
1939            .with_repetition(Repetition::REPEATED)
1940            .build()
1941            .unwrap();
1942        list.get_physical_type();
1943    }
1944
1945    #[test]
1946    fn test_get_physical_type_primitive() {
1947        let f = Type::primitive_type_builder("f", PhysicalType::INT64)
1948            .build()
1949            .unwrap();
1950        assert_eq!(f.get_physical_type(), PhysicalType::INT64);
1951
1952        let f = Type::primitive_type_builder("f", PhysicalType::BYTE_ARRAY)
1953            .build()
1954            .unwrap();
1955        assert_eq!(f.get_physical_type(), PhysicalType::BYTE_ARRAY);
1956    }
1957
1958    #[test]
1959    fn test_check_contains_primitive_primitive() {
1960        // OK
1961        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
1962            .build()
1963            .unwrap();
1964        let f2 = Type::primitive_type_builder("f", PhysicalType::INT32)
1965            .build()
1966            .unwrap();
1967        assert!(f1.check_contains(&f2));
1968
1969        // OK: different logical type does not affect check_contains
1970        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
1971            .with_converted_type(ConvertedType::UINT_8)
1972            .build()
1973            .unwrap();
1974        let f2 = Type::primitive_type_builder("f", PhysicalType::INT32)
1975            .with_converted_type(ConvertedType::UINT_16)
1976            .build()
1977            .unwrap();
1978        assert!(f1.check_contains(&f2));
1979
1980        // KO: different name
1981        let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32)
1982            .build()
1983            .unwrap();
1984        let f2 = Type::primitive_type_builder("f2", PhysicalType::INT32)
1985            .build()
1986            .unwrap();
1987        assert!(!f1.check_contains(&f2));
1988
1989        // KO: different type
1990        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
1991            .build()
1992            .unwrap();
1993        let f2 = Type::primitive_type_builder("f", PhysicalType::INT64)
1994            .build()
1995            .unwrap();
1996        assert!(!f1.check_contains(&f2));
1997
1998        // KO: different repetition
1999        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2000            .with_repetition(Repetition::REQUIRED)
2001            .build()
2002            .unwrap();
2003        let f2 = Type::primitive_type_builder("f", PhysicalType::INT32)
2004            .with_repetition(Repetition::OPTIONAL)
2005            .build()
2006            .unwrap();
2007        assert!(!f1.check_contains(&f2));
2008    }
2009
2010    // function to create a new group type for testing
2011    fn test_new_group_type(name: &str, repetition: Repetition, types: Vec<Type>) -> Type {
2012        Type::group_type_builder(name)
2013            .with_repetition(repetition)
2014            .with_fields(types.into_iter().map(Arc::new).collect())
2015            .build()
2016            .unwrap()
2017    }
2018
2019    #[test]
2020    fn test_check_contains_group_group() {
2021        // OK: should match okay with empty fields
2022        let f1 = Type::group_type_builder("f").build().unwrap();
2023        let f2 = Type::group_type_builder("f").build().unwrap();
2024        assert!(f1.check_contains(&f2));
2025        assert!(!f1.is_optional());
2026
2027        // OK: fields match
2028        let f1 = test_new_group_type(
2029            "f",
2030            Repetition::REPEATED,
2031            vec![
2032                Type::primitive_type_builder("f1", PhysicalType::INT32)
2033                    .build()
2034                    .unwrap(),
2035                Type::primitive_type_builder("f2", PhysicalType::INT64)
2036                    .build()
2037                    .unwrap(),
2038            ],
2039        );
2040        let f2 = test_new_group_type(
2041            "f",
2042            Repetition::REPEATED,
2043            vec![
2044                Type::primitive_type_builder("f1", PhysicalType::INT32)
2045                    .build()
2046                    .unwrap(),
2047                Type::primitive_type_builder("f2", PhysicalType::INT64)
2048                    .build()
2049                    .unwrap(),
2050            ],
2051        );
2052        assert!(f1.check_contains(&f2));
2053
2054        // OK: subset of fields
2055        let f1 = test_new_group_type(
2056            "f",
2057            Repetition::REPEATED,
2058            vec![
2059                Type::primitive_type_builder("f1", PhysicalType::INT32)
2060                    .build()
2061                    .unwrap(),
2062                Type::primitive_type_builder("f2", PhysicalType::INT64)
2063                    .build()
2064                    .unwrap(),
2065            ],
2066        );
2067        let f2 = test_new_group_type(
2068            "f",
2069            Repetition::REPEATED,
2070            vec![Type::primitive_type_builder("f2", PhysicalType::INT64)
2071                .build()
2072                .unwrap()],
2073        );
2074        assert!(f1.check_contains(&f2));
2075
2076        // KO: different name
2077        let f1 = Type::group_type_builder("f1").build().unwrap();
2078        let f2 = Type::group_type_builder("f2").build().unwrap();
2079        assert!(!f1.check_contains(&f2));
2080
2081        // KO: different repetition
2082        let f1 = Type::group_type_builder("f")
2083            .with_repetition(Repetition::OPTIONAL)
2084            .build()
2085            .unwrap();
2086        let f2 = Type::group_type_builder("f")
2087            .with_repetition(Repetition::REPEATED)
2088            .build()
2089            .unwrap();
2090        assert!(!f1.check_contains(&f2));
2091
2092        // KO: different fields
2093        let f1 = test_new_group_type(
2094            "f",
2095            Repetition::REPEATED,
2096            vec![
2097                Type::primitive_type_builder("f1", PhysicalType::INT32)
2098                    .build()
2099                    .unwrap(),
2100                Type::primitive_type_builder("f2", PhysicalType::INT64)
2101                    .build()
2102                    .unwrap(),
2103            ],
2104        );
2105        let f2 = test_new_group_type(
2106            "f",
2107            Repetition::REPEATED,
2108            vec![
2109                Type::primitive_type_builder("f1", PhysicalType::INT32)
2110                    .build()
2111                    .unwrap(),
2112                Type::primitive_type_builder("f2", PhysicalType::BOOLEAN)
2113                    .build()
2114                    .unwrap(),
2115            ],
2116        );
2117        assert!(!f1.check_contains(&f2));
2118
2119        // KO: different fields
2120        let f1 = test_new_group_type(
2121            "f",
2122            Repetition::REPEATED,
2123            vec![
2124                Type::primitive_type_builder("f1", PhysicalType::INT32)
2125                    .build()
2126                    .unwrap(),
2127                Type::primitive_type_builder("f2", PhysicalType::INT64)
2128                    .build()
2129                    .unwrap(),
2130            ],
2131        );
2132        let f2 = test_new_group_type(
2133            "f",
2134            Repetition::REPEATED,
2135            vec![Type::primitive_type_builder("f3", PhysicalType::INT32)
2136                .build()
2137                .unwrap()],
2138        );
2139        assert!(!f1.check_contains(&f2));
2140    }
2141
2142    #[test]
2143    fn test_check_contains_group_primitive() {
2144        // KO: should not match
2145        let f1 = Type::group_type_builder("f").build().unwrap();
2146        let f2 = Type::primitive_type_builder("f", PhysicalType::INT64)
2147            .build()
2148            .unwrap();
2149        assert!(!f1.check_contains(&f2));
2150        assert!(!f2.check_contains(&f1));
2151
2152        // KO: should not match when primitive field is part of group type
2153        let f1 = test_new_group_type(
2154            "f",
2155            Repetition::REPEATED,
2156            vec![Type::primitive_type_builder("f1", PhysicalType::INT32)
2157                .build()
2158                .unwrap()],
2159        );
2160        let f2 = Type::primitive_type_builder("f1", PhysicalType::INT32)
2161            .build()
2162            .unwrap();
2163        assert!(!f1.check_contains(&f2));
2164        assert!(!f2.check_contains(&f1));
2165
2166        // OK: match nested types
2167        let f1 = test_new_group_type(
2168            "a",
2169            Repetition::REPEATED,
2170            vec![
2171                test_new_group_type(
2172                    "b",
2173                    Repetition::REPEATED,
2174                    vec![Type::primitive_type_builder("c", PhysicalType::INT32)
2175                        .build()
2176                        .unwrap()],
2177                ),
2178                Type::primitive_type_builder("d", PhysicalType::INT64)
2179                    .build()
2180                    .unwrap(),
2181                Type::primitive_type_builder("e", PhysicalType::BOOLEAN)
2182                    .build()
2183                    .unwrap(),
2184            ],
2185        );
2186        let f2 = test_new_group_type(
2187            "a",
2188            Repetition::REPEATED,
2189            vec![test_new_group_type(
2190                "b",
2191                Repetition::REPEATED,
2192                vec![Type::primitive_type_builder("c", PhysicalType::INT32)
2193                    .build()
2194                    .unwrap()],
2195            )],
2196        );
2197        assert!(f1.check_contains(&f2)); // should match
2198        assert!(!f2.check_contains(&f1)); // should fail
2199    }
2200
2201    #[test]
2202    fn test_schema_type_thrift_conversion_err() {
2203        let schema = Type::primitive_type_builder("col", PhysicalType::INT32)
2204            .build()
2205            .unwrap();
2206        let thrift_schema = to_thrift(&schema);
2207        assert!(thrift_schema.is_err());
2208        if let Err(e) = thrift_schema {
2209            assert_eq!(
2210                format!("{e}"),
2211                "Parquet error: Root schema must be Group type"
2212            );
2213        }
2214    }
2215
2216    #[test]
2217    fn test_schema_type_thrift_conversion() {
2218        let message_type = "
2219    message conversions {
2220      REQUIRED INT64 id;
2221      OPTIONAL FIXED_LEN_BYTE_ARRAY (2) f16 (FLOAT16);
2222      OPTIONAL group int_array_Array (LIST) {
2223        REPEATED group list {
2224          OPTIONAL group element (LIST) {
2225            REPEATED group list {
2226              OPTIONAL INT32 element;
2227            }
2228          }
2229        }
2230      }
2231      OPTIONAL group int_map (MAP) {
2232        REPEATED group map (MAP_KEY_VALUE) {
2233          REQUIRED BYTE_ARRAY key (UTF8);
2234          OPTIONAL INT32 value;
2235        }
2236      }
2237      OPTIONAL group int_Map_Array (LIST) {
2238        REPEATED group list {
2239          OPTIONAL group g (MAP) {
2240            REPEATED group map (MAP_KEY_VALUE) {
2241              REQUIRED BYTE_ARRAY key (UTF8);
2242              OPTIONAL group value {
2243                OPTIONAL group H {
2244                  OPTIONAL group i (LIST) {
2245                    REPEATED group list {
2246                      OPTIONAL DOUBLE element;
2247                    }
2248                  }
2249                }
2250              }
2251            }
2252          }
2253        }
2254      }
2255      OPTIONAL group nested_struct {
2256        OPTIONAL INT32 A;
2257        OPTIONAL group b (LIST) {
2258          REPEATED group list {
2259            REQUIRED FIXED_LEN_BYTE_ARRAY (16) element;
2260          }
2261        }
2262      }
2263    }
2264    ";
2265        let expected_schema = parse_message_type(message_type).unwrap();
2266        let thrift_schema = to_thrift(&expected_schema).unwrap();
2267        let result_schema = from_thrift(&thrift_schema).unwrap();
2268        assert_eq!(result_schema, Arc::new(expected_schema));
2269    }
2270
2271    #[test]
2272    fn test_schema_type_thrift_conversion_decimal() {
2273        let message_type = "
2274    message decimals {
2275      OPTIONAL INT32 field0;
2276      OPTIONAL INT64 field1 (DECIMAL (18, 2));
2277      OPTIONAL FIXED_LEN_BYTE_ARRAY (16) field2 (DECIMAL (38, 18));
2278      OPTIONAL BYTE_ARRAY field3 (DECIMAL (9));
2279    }
2280    ";
2281        let expected_schema = parse_message_type(message_type).unwrap();
2282        let thrift_schema = to_thrift(&expected_schema).unwrap();
2283        let result_schema = from_thrift(&thrift_schema).unwrap();
2284        assert_eq!(result_schema, Arc::new(expected_schema));
2285    }
2286
2287    // Tests schema conversion from thrift, when num_children is set to Some(0) for a
2288    // primitive type.
2289    #[test]
2290    fn test_schema_from_thrift_with_num_children_set() {
2291        // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT
2292        let message_type = "
2293    message schema {
2294      OPTIONAL BYTE_ARRAY id (UTF8);
2295      OPTIONAL BYTE_ARRAY name (UTF8);
2296      OPTIONAL BYTE_ARRAY message (UTF8);
2297      OPTIONAL INT32 type (UINT_8);
2298      OPTIONAL INT64 author_time (TIMESTAMP_MILLIS);
2299      OPTIONAL INT64 __index_level_0__;
2300    }
2301    ";
2302
2303        let expected_schema = parse_message_type(message_type).unwrap();
2304        let mut thrift_schema = to_thrift(&expected_schema).unwrap();
2305        // Change all of None to Some(0)
2306        for elem in &mut thrift_schema[..] {
2307            if elem.num_children.is_none() {
2308                elem.num_children = Some(0);
2309            }
2310        }
2311
2312        let result_schema = from_thrift(&thrift_schema).unwrap();
2313        assert_eq!(result_schema, Arc::new(expected_schema));
2314    }
2315
2316    // Sometimes parquet-cpp sets repetition level for the root node, which is against
2317    // the format definition, but we need to handle it by setting it back to None.
2318    #[test]
2319    fn test_schema_from_thrift_root_has_repetition() {
2320        // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT
2321        let message_type = "
2322    message schema {
2323      OPTIONAL BYTE_ARRAY a (UTF8);
2324      OPTIONAL INT32 b (UINT_8);
2325    }
2326    ";
2327
2328        let expected_schema = parse_message_type(message_type).unwrap();
2329        let mut thrift_schema = to_thrift(&expected_schema).unwrap();
2330        thrift_schema[0].repetition_type = Some(Repetition::REQUIRED.into());
2331
2332        let result_schema = from_thrift(&thrift_schema).unwrap();
2333        assert_eq!(result_schema, Arc::new(expected_schema));
2334    }
2335
2336    #[test]
2337    fn test_schema_from_thrift_group_has_no_child() {
2338        let message_type = "message schema {}";
2339
2340        let expected_schema = parse_message_type(message_type).unwrap();
2341        let mut thrift_schema = to_thrift(&expected_schema).unwrap();
2342        thrift_schema[0].repetition_type = Some(Repetition::REQUIRED.into());
2343
2344        let result_schema = from_thrift(&thrift_schema).unwrap();
2345        assert_eq!(result_schema, Arc::new(expected_schema));
2346    }
2347}