Skip to main content

parquet/schema/
types.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains structs and methods to build Parquet schema and schema descriptors.
19
20use std::vec::IntoIter;
21use std::{collections::HashMap, fmt, sync::Arc};
22
23use crate::file::metadata::HeapSize;
24use crate::file::metadata::thrift::SchemaElement;
25
26use crate::basic::{
27    ColumnOrder, ConvertedType, LogicalType, Repetition, SortOrder, TimeUnit, Type as PhysicalType,
28};
29use crate::errors::{ParquetError, Result};
30
31// ----------------------------------------------------------------------
32// Parquet Type definitions
33
34/// Type alias for `Arc<Type>`.
35pub type TypePtr = Arc<Type>;
36/// Type alias for `Arc<SchemaDescriptor>`.
37pub type SchemaDescPtr = Arc<SchemaDescriptor>;
38/// Type alias for `Arc<ColumnDescriptor>`.
39pub type ColumnDescPtr = Arc<ColumnDescriptor>;
40
41/// Representation of a Parquet type.
42///
43/// Used to describe primitive leaf fields and structs, including top-level schema.
44///
45/// Note that the top-level schema is represented using [`Type::GroupType`] whose
46/// repetition is `None`.
47#[derive(Clone, Debug, PartialEq)]
48pub enum Type {
49    /// Represents a primitive leaf field.
50    PrimitiveType {
51        /// Basic information about the type.
52        basic_info: BasicTypeInfo,
53        /// Physical type of this primitive type.
54        physical_type: PhysicalType,
55        /// Length of this type.
56        type_length: i32,
57        /// Scale of this type.
58        scale: i32,
59        /// Precision of this type.
60        precision: i32,
61    },
62    /// Represents a group of fields (similar to struct).
63    GroupType {
64        /// Basic information about the type.
65        basic_info: BasicTypeInfo,
66        /// Fields of this group type.
67        fields: Vec<TypePtr>,
68    },
69}
70
71impl HeapSize for Type {
72    fn heap_size(&self) -> usize {
73        match self {
74            Type::PrimitiveType { basic_info, .. } => basic_info.heap_size(),
75            Type::GroupType { basic_info, fields } => basic_info.heap_size() + fields.heap_size(),
76        }
77    }
78}
79
80impl Type {
81    /// Creates primitive type builder with provided field name and physical type.
82    pub fn primitive_type_builder(
83        name: &str,
84        physical_type: PhysicalType,
85    ) -> PrimitiveTypeBuilder<'_> {
86        PrimitiveTypeBuilder::new(name, physical_type)
87    }
88
89    /// Creates group type builder with provided column name.
90    pub fn group_type_builder(name: &str) -> GroupTypeBuilder<'_> {
91        GroupTypeBuilder::new(name)
92    }
93
94    /// Returns [`BasicTypeInfo`] information about the type.
95    pub fn get_basic_info(&self) -> &BasicTypeInfo {
96        match *self {
97            Type::PrimitiveType { ref basic_info, .. } => basic_info,
98            Type::GroupType { ref basic_info, .. } => basic_info,
99        }
100    }
101
102    /// Returns this type's field name.
103    pub fn name(&self) -> &str {
104        self.get_basic_info().name()
105    }
106
107    /// Gets the fields from this group type.
108    /// Note that this will panic if called on a non-group type.
109    // TODO: should we return `&[&Type]` here?
110    pub fn get_fields(&self) -> &[TypePtr] {
111        match *self {
112            Type::GroupType { ref fields, .. } => &fields[..],
113            _ => panic!("Cannot call get_fields() on a non-group type"),
114        }
115    }
116
117    /// Gets physical type of this primitive type.
118    /// Note that this will panic if called on a non-primitive type.
119    pub fn get_physical_type(&self) -> PhysicalType {
120        match *self {
121            Type::PrimitiveType {
122                basic_info: _,
123                physical_type,
124                ..
125            } => physical_type,
126            _ => panic!("Cannot call get_physical_type() on a non-primitive type"),
127        }
128    }
129
130    /// Gets precision of this primitive type.
131    /// Note that this will panic if called on a non-primitive type.
132    pub fn get_precision(&self) -> i32 {
133        match *self {
134            Type::PrimitiveType { precision, .. } => precision,
135            _ => panic!("Cannot call get_precision() on non-primitive type"),
136        }
137    }
138
139    /// Gets scale of this primitive type.
140    /// Note that this will panic if called on a non-primitive type.
141    pub fn get_scale(&self) -> i32 {
142        match *self {
143            Type::PrimitiveType { scale, .. } => scale,
144            _ => panic!("Cannot call get_scale() on non-primitive type"),
145        }
146    }
147
148    /// Checks if `sub_type` schema is part of current schema.
149    /// This method can be used to check if projected columns are part of the root schema.
150    pub fn check_contains(&self, sub_type: &Type) -> bool {
151        // Names match, and repetitions match or not set for both
152        let basic_match = self.get_basic_info().name() == sub_type.get_basic_info().name()
153            && (self.is_schema() && sub_type.is_schema()
154                || !self.is_schema()
155                    && !sub_type.is_schema()
156                    && self.get_basic_info().repetition()
157                        == sub_type.get_basic_info().repetition());
158
159        match *self {
160            Type::PrimitiveType { .. } if basic_match && sub_type.is_primitive() => {
161                self.get_physical_type() == sub_type.get_physical_type()
162            }
163            Type::GroupType { .. } if basic_match && sub_type.is_group() => {
164                // build hashmap of name -> TypePtr
165                let mut field_map = HashMap::new();
166                for field in self.get_fields() {
167                    field_map.insert(field.name(), field);
168                }
169
170                for field in sub_type.get_fields() {
171                    if !field_map
172                        .get(field.name())
173                        .map(|tpe| tpe.check_contains(field))
174                        .unwrap_or(false)
175                    {
176                        return false;
177                    }
178                }
179                true
180            }
181            _ => false,
182        }
183    }
184
185    /// Returns `true` if this type is a primitive type, `false` otherwise.
186    pub fn is_primitive(&self) -> bool {
187        matches!(*self, Type::PrimitiveType { .. })
188    }
189
190    /// Returns `true` if this type is a group type, `false` otherwise.
191    pub fn is_group(&self) -> bool {
192        matches!(*self, Type::GroupType { .. })
193    }
194
195    /// Returns `true` if this type is the top-level schema type (message type).
196    pub fn is_schema(&self) -> bool {
197        match *self {
198            Type::GroupType { ref basic_info, .. } => !basic_info.has_repetition(),
199            _ => false,
200        }
201    }
202
203    /// Returns `true` if this type is repeated or optional.
204    /// If this type doesn't have repetition defined, we treat it as required.
205    pub fn is_optional(&self) -> bool {
206        self.get_basic_info().has_repetition()
207            && self.get_basic_info().repetition() != Repetition::REQUIRED
208    }
209
210    /// Returns `true` if this type is annotated as a list.
211    pub(crate) fn is_list(&self) -> bool {
212        if self.is_group() {
213            let basic_info = self.get_basic_info();
214            if let Some(logical_type) = basic_info.logical_type_ref() {
215                return logical_type == &LogicalType::List;
216            }
217            return basic_info.converted_type() == ConvertedType::LIST;
218        }
219        false
220    }
221
222    /// Returns `true` if this type is a group with a single child field that is `repeated`.
223    pub(crate) fn has_single_repeated_child(&self) -> bool {
224        if self.is_group() {
225            let children = self.get_fields();
226            return children.len() == 1
227                && children[0].get_basic_info().has_repetition()
228                && children[0].get_basic_info().repetition() == Repetition::REPEATED;
229        }
230        false
231    }
232}
233
234/// A builder for primitive types. All attributes are optional
235/// except the name and physical type.
236/// Note that if not specified explicitly, `Repetition::OPTIONAL` is used.
237pub struct PrimitiveTypeBuilder<'a> {
238    name: &'a str,
239    repetition: Repetition,
240    physical_type: PhysicalType,
241    converted_type: ConvertedType,
242    logical_type: Option<LogicalType>,
243    length: i32,
244    precision: i32,
245    scale: i32,
246    id: Option<i32>,
247}
248
249impl<'a> PrimitiveTypeBuilder<'a> {
250    /// Creates new primitive type builder with provided field name and physical type.
251    pub fn new(name: &'a str, physical_type: PhysicalType) -> Self {
252        Self {
253            name,
254            repetition: Repetition::OPTIONAL,
255            physical_type,
256            converted_type: ConvertedType::NONE,
257            logical_type: None,
258            length: -1,
259            precision: -1,
260            scale: -1,
261            id: None,
262        }
263    }
264
265    /// Sets [`Repetition`] for this field and returns itself.
266    pub fn with_repetition(self, repetition: Repetition) -> Self {
267        Self { repetition, ..self }
268    }
269
270    /// Sets [`ConvertedType`] for this field and returns itself.
271    pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
272        Self {
273            converted_type,
274            ..self
275        }
276    }
277
278    /// Sets [`LogicalType`] for this field and returns itself.
279    /// If only the logical type is populated for a primitive type, the converted type
280    /// will be automatically populated, and can thus be omitted.
281    pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
282        Self {
283            logical_type,
284            ..self
285        }
286    }
287
288    /// Sets type length and returns itself.
289    /// This is only applied to FIXED_LEN_BYTE_ARRAY and INT96 (INTERVAL) types, because
290    /// they maintain fixed size underlying byte array.
291    /// By default, value is `0`.
292    pub fn with_length(self, length: i32) -> Self {
293        Self { length, ..self }
294    }
295
296    /// Sets precision for Parquet DECIMAL physical type and returns itself.
297    /// By default, it equals to `0` and used only for decimal context.
298    pub fn with_precision(self, precision: i32) -> Self {
299        Self { precision, ..self }
300    }
301
302    /// Sets scale for Parquet DECIMAL physical type and returns itself.
303    /// By default, it equals to `0` and used only for decimal context.
304    pub fn with_scale(self, scale: i32) -> Self {
305        Self { scale, ..self }
306    }
307
308    /// Sets optional field id and returns itself.
309    pub fn with_id(self, id: Option<i32>) -> Self {
310        Self { id, ..self }
311    }
312
313    /// Creates a new `PrimitiveType` instance from the collected attributes.
314    /// Returns `Err` in case of any building conditions are not met.
315    pub fn build(self) -> Result<Type> {
316        let mut basic_info = BasicTypeInfo {
317            name: String::from(self.name),
318            repetition: Some(self.repetition),
319            converted_type: self.converted_type,
320            logical_type: self.logical_type.clone(),
321            id: self.id,
322        };
323
324        // Check length before logical type, since it is used for logical type validation.
325        if self.physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY && self.length < 0 {
326            return Err(general_err!(
327                "Invalid FIXED_LEN_BYTE_ARRAY length: {} for field '{}'",
328                self.length,
329                self.name
330            ));
331        }
332
333        if let Some(logical_type) = &self.logical_type {
334            // If a converted type is populated, check that it is consistent with
335            // its logical type
336            if self.converted_type != ConvertedType::NONE {
337                if ConvertedType::from(self.logical_type.clone()) != self.converted_type {
338                    return Err(general_err!(
339                        "Logical type {:?} is incompatible with converted type {} for field '{}'",
340                        logical_type,
341                        self.converted_type,
342                        self.name
343                    ));
344                }
345            } else {
346                // Populate the converted type for backwards compatibility
347                basic_info.converted_type = self.logical_type.clone().into();
348            }
349            // Check that logical type and physical type are compatible
350            match (logical_type, self.physical_type) {
351                (LogicalType::Map, _) | (LogicalType::List, _) => {
352                    return Err(general_err!(
353                        "{:?} cannot be applied to a primitive type for field '{}'",
354                        logical_type,
355                        self.name
356                    ));
357                }
358                (LogicalType::Enum, PhysicalType::BYTE_ARRAY) => {}
359                (LogicalType::Decimal { scale, precision }, _) => {
360                    // Check that scale and precision are consistent with legacy values
361                    if *scale != self.scale {
362                        return Err(general_err!(
363                            "DECIMAL logical type scale {} must match self.scale {} for field '{}'",
364                            scale,
365                            self.scale,
366                            self.name
367                        ));
368                    }
369                    if *precision != self.precision {
370                        return Err(general_err!(
371                            "DECIMAL logical type precision {} must match self.precision {} for field '{}'",
372                            precision,
373                            self.precision,
374                            self.name
375                        ));
376                    }
377                    self.check_decimal_precision_scale()?;
378                }
379                (LogicalType::Date, PhysicalType::INT32) => {}
380                (
381                    LogicalType::Time {
382                        unit: TimeUnit::MILLIS,
383                        ..
384                    },
385                    PhysicalType::INT32,
386                ) => {}
387                (LogicalType::Time { unit, .. }, PhysicalType::INT64) => {
388                    if *unit == TimeUnit::MILLIS {
389                        return Err(general_err!(
390                            "Cannot use millisecond unit on INT64 type for field '{}'",
391                            self.name
392                        ));
393                    }
394                }
395                (LogicalType::Timestamp { .. }, PhysicalType::INT64) => {}
396                (LogicalType::Integer { bit_width, .. }, PhysicalType::INT32)
397                    if *bit_width <= 32 => {}
398                (LogicalType::Integer { bit_width, .. }, PhysicalType::INT64)
399                    if *bit_width == 64 => {}
400                // Null type
401                (LogicalType::Unknown, PhysicalType::INT32) => {}
402                (LogicalType::String, PhysicalType::BYTE_ARRAY) => {}
403                (LogicalType::Json, PhysicalType::BYTE_ARRAY) => {}
404                (LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {}
405                (LogicalType::Geometry { .. }, PhysicalType::BYTE_ARRAY) => {}
406                (LogicalType::Geography { .. }, PhysicalType::BYTE_ARRAY) => {}
407                (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) if self.length == 16 => {}
408                (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
409                    return Err(general_err!(
410                        "UUID cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(16) field",
411                        self.name
412                    ));
413                }
414                (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY) if self.length == 2 => {}
415                (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
416                    return Err(general_err!(
417                        "FLOAT16 cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(2) field",
418                        self.name
419                    ));
420                }
421                // unknown logical type means just use physical type
422                (LogicalType::_Unknown { .. }, _) => {}
423                (a, b) => {
424                    return Err(general_err!(
425                        "Cannot annotate {:?} from {} for field '{}'",
426                        a,
427                        b,
428                        self.name
429                    ));
430                }
431            }
432        }
433
434        match self.converted_type {
435            ConvertedType::NONE => {}
436            ConvertedType::UTF8 | ConvertedType::BSON | ConvertedType::JSON => {
437                if self.physical_type != PhysicalType::BYTE_ARRAY {
438                    return Err(general_err!(
439                        "{} cannot annotate field '{}' because it is not a BYTE_ARRAY field",
440                        self.converted_type,
441                        self.name
442                    ));
443                }
444            }
445            ConvertedType::DECIMAL => {
446                self.check_decimal_precision_scale()?;
447            }
448            ConvertedType::DATE
449            | ConvertedType::TIME_MILLIS
450            | ConvertedType::UINT_8
451            | ConvertedType::UINT_16
452            | ConvertedType::UINT_32
453            | ConvertedType::INT_8
454            | ConvertedType::INT_16
455            | ConvertedType::INT_32 => {
456                if self.physical_type != PhysicalType::INT32 {
457                    return Err(general_err!(
458                        "{} cannot annotate field '{}' because it is not a INT32 field",
459                        self.converted_type,
460                        self.name
461                    ));
462                }
463            }
464            ConvertedType::TIME_MICROS
465            | ConvertedType::TIMESTAMP_MILLIS
466            | ConvertedType::TIMESTAMP_MICROS
467            | ConvertedType::UINT_64
468            | ConvertedType::INT_64 => {
469                if self.physical_type != PhysicalType::INT64 {
470                    return Err(general_err!(
471                        "{} cannot annotate field '{}' because it is not a INT64 field",
472                        self.converted_type,
473                        self.name
474                    ));
475                }
476            }
477            ConvertedType::INTERVAL => {
478                if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 {
479                    return Err(general_err!(
480                        "INTERVAL cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(12) field",
481                        self.name
482                    ));
483                }
484            }
485            ConvertedType::ENUM => {
486                if self.physical_type != PhysicalType::BYTE_ARRAY {
487                    return Err(general_err!(
488                        "ENUM cannot annotate field '{}' because it is not a BYTE_ARRAY field",
489                        self.name
490                    ));
491                }
492            }
493            _ => {
494                return Err(general_err!(
495                    "{} cannot be applied to primitive field '{}'",
496                    self.converted_type,
497                    self.name
498                ));
499            }
500        }
501
502        Ok(Type::PrimitiveType {
503            basic_info,
504            physical_type: self.physical_type,
505            type_length: self.length,
506            scale: self.scale,
507            precision: self.precision,
508        })
509    }
510
511    #[inline]
512    fn check_decimal_precision_scale(&self) -> Result<()> {
513        match self.physical_type {
514            PhysicalType::INT32
515            | PhysicalType::INT64
516            | PhysicalType::BYTE_ARRAY
517            | PhysicalType::FIXED_LEN_BYTE_ARRAY => (),
518            _ => {
519                return Err(general_err!(
520                    "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
521                ));
522            }
523        }
524
525        // Precision is required and must be a non-zero positive integer.
526        if self.precision < 1 {
527            return Err(general_err!(
528                "Invalid DECIMAL precision: {}",
529                self.precision
530            ));
531        }
532
533        // Scale must be zero or a positive integer less than the precision.
534        if self.scale < 0 {
535            return Err(general_err!("Invalid DECIMAL scale: {}", self.scale));
536        }
537
538        if self.scale > self.precision {
539            return Err(general_err!(
540                "Invalid DECIMAL: scale ({}) cannot be greater than precision \
541             ({})",
542                self.scale,
543                self.precision
544            ));
545        }
546
547        // Check precision and scale based on physical type limitations.
548        match self.physical_type {
549            PhysicalType::INT32 => {
550                if self.precision > 9 {
551                    return Err(general_err!(
552                        "Cannot represent INT32 as DECIMAL with precision {}",
553                        self.precision
554                    ));
555                }
556            }
557            PhysicalType::INT64 => {
558                if self.precision > 18 {
559                    return Err(general_err!(
560                        "Cannot represent INT64 as DECIMAL with precision {}",
561                        self.precision
562                    ));
563                }
564            }
565            PhysicalType::FIXED_LEN_BYTE_ARRAY => {
566                let length = self
567                    .length
568                    .checked_mul(8)
569                    .ok_or(general_err!("Invalid length {} for Decimal", self.length))?;
570                let max_precision = (2f64.powi(length - 1) - 1f64).log10().floor() as i32;
571
572                if self.precision > max_precision {
573                    return Err(general_err!(
574                        "Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length {} and \
575                        precision {}. The max precision can only be {}",
576                        self.length,
577                        self.precision,
578                        max_precision
579                    ));
580                }
581            }
582            _ => (), // For BYTE_ARRAY precision is not limited
583        }
584
585        Ok(())
586    }
587}
588
589/// A builder for group types. All attributes are optional except the name.
590/// Note that if not specified explicitly, `None` is used as the repetition of the group,
591/// which means it is a root (message) type.
592pub struct GroupTypeBuilder<'a> {
593    name: &'a str,
594    repetition: Option<Repetition>,
595    converted_type: ConvertedType,
596    logical_type: Option<LogicalType>,
597    fields: Vec<TypePtr>,
598    id: Option<i32>,
599}
600
601impl<'a> GroupTypeBuilder<'a> {
602    /// Creates new group type builder with provided field name.
603    pub fn new(name: &'a str) -> Self {
604        Self {
605            name,
606            repetition: None,
607            converted_type: ConvertedType::NONE,
608            logical_type: None,
609            fields: Vec::new(),
610            id: None,
611        }
612    }
613
614    /// Sets [`Repetition`] for this field and returns itself.
615    pub fn with_repetition(mut self, repetition: Repetition) -> Self {
616        self.repetition = Some(repetition);
617        self
618    }
619
620    /// Sets [`ConvertedType`] for this field and returns itself.
621    pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
622        Self {
623            converted_type,
624            ..self
625        }
626    }
627
628    /// Sets [`LogicalType`] for this field and returns itself.
629    pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
630        Self {
631            logical_type,
632            ..self
633        }
634    }
635
636    /// Sets a list of fields that should be child nodes of this field.
637    /// Returns updated self.
638    pub fn with_fields(self, fields: Vec<TypePtr>) -> Self {
639        Self { fields, ..self }
640    }
641
642    /// Sets optional field id and returns itself.
643    pub fn with_id(self, id: Option<i32>) -> Self {
644        Self { id, ..self }
645    }
646
647    /// Creates a new `GroupType` instance from the gathered attributes.
648    pub fn build(self) -> Result<Type> {
649        let mut basic_info = BasicTypeInfo {
650            name: String::from(self.name),
651            repetition: self.repetition,
652            converted_type: self.converted_type,
653            logical_type: self.logical_type.clone(),
654            id: self.id,
655        };
656        // Populate the converted type if only the logical type is populated
657        if self.logical_type.is_some() && self.converted_type == ConvertedType::NONE {
658            basic_info.converted_type = self.logical_type.into();
659        }
660        Ok(Type::GroupType {
661            basic_info,
662            fields: self.fields,
663        })
664    }
665}
666
667/// Basic type info. This contains information such as the name of the type,
668/// the repetition level, the logical type and the kind of the type (group, primitive).
669#[derive(Clone, Debug, PartialEq, Eq)]
670pub struct BasicTypeInfo {
671    name: String,
672    repetition: Option<Repetition>,
673    converted_type: ConvertedType,
674    logical_type: Option<LogicalType>,
675    id: Option<i32>,
676}
677
678impl HeapSize for BasicTypeInfo {
679    fn heap_size(&self) -> usize {
680        // no heap allocations in any other subfield
681        self.name.heap_size()
682    }
683}
684
685impl BasicTypeInfo {
686    /// Returns field name.
687    pub fn name(&self) -> &str {
688        &self.name
689    }
690
691    /// Returns `true` if type has repetition field set, `false` otherwise.
692    /// This is mostly applied to group type, because primitive type always has
693    /// repetition set.
694    pub fn has_repetition(&self) -> bool {
695        self.repetition.is_some()
696    }
697
698    /// Returns [`Repetition`] value for the type.
699    pub fn repetition(&self) -> Repetition {
700        assert!(self.repetition.is_some());
701        self.repetition.unwrap()
702    }
703
704    /// Returns [`ConvertedType`] value for the type.
705    pub fn converted_type(&self) -> ConvertedType {
706        self.converted_type
707    }
708
709    /// Returns [`LogicalType`] value for the type.
710    ///
711    /// Note that this function will clone the `LogicalType`. If performance is a concern,
712    /// use [`Self::logical_type_ref`] instead.
713    #[deprecated(
714        since = "57.1.0",
715        note = "use `BasicTypeInfo::logical_type_ref` instead (LogicalType cloning is non trivial)"
716    )]
717    pub fn logical_type(&self) -> Option<LogicalType> {
718        // Unlike ConvertedType, LogicalType cannot implement Copy, thus we clone it
719        self.logical_type.clone()
720    }
721
722    /// Return a reference to the [`LogicalType`] value for the type.
723    pub fn logical_type_ref(&self) -> Option<&LogicalType> {
724        self.logical_type.as_ref()
725    }
726
727    /// Returns `true` if id is set, `false` otherwise.
728    pub fn has_id(&self) -> bool {
729        self.id.is_some()
730    }
731
732    /// Returns id value for the type.
733    pub fn id(&self) -> i32 {
734        assert!(self.id.is_some());
735        self.id.unwrap()
736    }
737}
738
739// ----------------------------------------------------------------------
740// Parquet descriptor definitions
741
/// Location of a column inside a Parquet schema, stored as the list of field
/// names leading to it.
///
/// # Example: refer to column named `'my_column'`
/// ```
/// # use parquet::schema::types::ColumnPath;
/// let column_path = ColumnPath::from("my_column");
/// ```
///
/// # Example: refer to column named `c` in a nested struct `{a: {b: {c: ...}}}`
/// ```
/// # use parquet::schema::types::ColumnPath;
/// // form path 'a.b.c'
/// let column_path = ColumnPath::from(vec![
///   String::from("a"),
///   String::from("b"),
///   String::from("c")
/// ]);
/// ```
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct ColumnPath {
    parts: Vec<String>,
}
764
765impl HeapSize for ColumnPath {
766    fn heap_size(&self) -> usize {
767        self.parts.heap_size()
768    }
769}
770
771impl ColumnPath {
772    /// Creates new column path from vector of field names.
773    pub fn new(parts: Vec<String>) -> Self {
774        ColumnPath { parts }
775    }
776
777    /// Returns string representation of this column path.
778    /// ```rust
779    /// use parquet::schema::types::ColumnPath;
780    ///
781    /// let path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c".to_string()]);
782    /// assert_eq!(&path.string(), "a.b.c");
783    /// ```
784    pub fn string(&self) -> String {
785        self.parts.join(".")
786    }
787
788    /// Appends more components to end of column path.
789    /// ```rust
790    /// use parquet::schema::types::ColumnPath;
791    ///
792    /// let mut path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c"
793    /// .to_string()]);
794    /// assert_eq!(&path.string(), "a.b.c");
795    ///
796    /// path.append(vec!["d".to_string(), "e".to_string()]);
797    /// assert_eq!(&path.string(), "a.b.c.d.e");
798    /// ```
799    pub fn append(&mut self, mut tail: Vec<String>) {
800        self.parts.append(&mut tail);
801    }
802
803    /// Returns a slice of path components.
804    pub fn parts(&self) -> &[String] {
805        &self.parts
806    }
807}
808
809impl fmt::Display for ColumnPath {
810    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
811        write!(f, "{:?}", self.string())
812    }
813}
814
815impl From<Vec<String>> for ColumnPath {
816    fn from(parts: Vec<String>) -> Self {
817        ColumnPath { parts }
818    }
819}
820
821impl From<&str> for ColumnPath {
822    fn from(single_path: &str) -> Self {
823        let s = String::from(single_path);
824        ColumnPath::from(s)
825    }
826}
827
828impl From<String> for ColumnPath {
829    fn from(single_path: String) -> Self {
830        let v = vec![single_path];
831        ColumnPath { parts: v }
832    }
833}
834
835impl AsRef<[String]> for ColumnPath {
836    fn as_ref(&self) -> &[String] {
837        &self.parts
838    }
839}
840
/// Physical type for leaf-level primitive columns.
///
/// Also includes the maximum definition and repetition levels required to
/// re-assemble nested data.
///
/// When built as part of a [`SchemaDescriptor`], the levels are accumulated
/// while walking from the root column down to the leaf: every `OPTIONAL` or
/// `REPEATED` node on the path adds one definition level, and every
/// `REPEATED` node adds one repetition level.
#[derive(Debug, PartialEq)]
pub struct ColumnDescriptor {
    /// The "leaf" primitive type of this column
    primitive_type: TypePtr,

    /// The maximum definition level for this column
    max_def_level: i16,

    /// The maximum repetition level for this column
    max_rep_level: i16,

    /// The definition level at the nearest REPEATED ancestor, or 0 if none.
    ///
    /// Always 0 for descriptors created through [`ColumnDescriptor::new`].
    repeated_ancestor_def_level: i16,

    /// The path of this column. For instance, "a.b.c.d".
    path: ColumnPath,
}
862
863impl HeapSize for ColumnDescriptor {
864    fn heap_size(&self) -> usize {
865        // Don't include the heap size of primitive_type, this is already
866        // accounted for via SchemaDescriptor::schema
867        self.path.heap_size()
868    }
869}
870
871impl ColumnDescriptor {
872    /// Creates new descriptor for leaf-level column.
873    pub fn new(
874        primitive_type: TypePtr,
875        max_def_level: i16,
876        max_rep_level: i16,
877        path: ColumnPath,
878    ) -> Self {
879        Self::new_with_repeated_ancestor(primitive_type, max_def_level, max_rep_level, path, 0)
880    }
881
882    pub(crate) fn new_with_repeated_ancestor(
883        primitive_type: TypePtr,
884        max_def_level: i16,
885        max_rep_level: i16,
886        path: ColumnPath,
887        repeated_ancestor_def_level: i16,
888    ) -> Self {
889        Self {
890            primitive_type,
891            max_def_level,
892            max_rep_level,
893            repeated_ancestor_def_level,
894            path,
895        }
896    }
897
898    /// Returns maximum definition level for this column.
899    #[inline]
900    pub fn max_def_level(&self) -> i16 {
901        self.max_def_level
902    }
903
904    /// Returns maximum repetition level for this column.
905    #[inline]
906    pub fn max_rep_level(&self) -> i16 {
907        self.max_rep_level
908    }
909
910    /// Returns the definition level at the nearest REPEATED ancestor, or 0 if none.
911    #[inline]
912    pub fn repeated_ancestor_def_level(&self) -> i16 {
913        self.repeated_ancestor_def_level
914    }
915
916    /// Returns [`ColumnPath`] for this column.
917    pub fn path(&self) -> &ColumnPath {
918        &self.path
919    }
920
921    /// Returns self type [`Type`] for this leaf column.
922    pub fn self_type(&self) -> &Type {
923        self.primitive_type.as_ref()
924    }
925
926    /// Returns self type [`TypePtr`]  for this leaf
927    /// column.
928    pub fn self_type_ptr(&self) -> TypePtr {
929        self.primitive_type.clone()
930    }
931
932    /// Returns column name.
933    pub fn name(&self) -> &str {
934        self.primitive_type.name()
935    }
936
937    /// Returns [`ConvertedType`] for this column.
938    pub fn converted_type(&self) -> ConvertedType {
939        self.primitive_type.get_basic_info().converted_type()
940    }
941
942    /// Returns [`LogicalType`] for this column.
943    ///
944    /// Note that this function will clone the `LogicalType`. If performance is a concern,
945    /// use [`Self::logical_type_ref`] instead.
946    #[deprecated(
947        since = "57.1.0",
948        note = "use `ColumnDescriptor::logical_type_ref` instead (LogicalType cloning is non trivial)"
949    )]
950    pub fn logical_type(&self) -> Option<LogicalType> {
951        self.primitive_type
952            .get_basic_info()
953            .logical_type_ref()
954            .cloned()
955    }
956
957    /// Returns a reference to the [`LogicalType`] for this column.
958    pub fn logical_type_ref(&self) -> Option<&LogicalType> {
959        self.primitive_type.get_basic_info().logical_type_ref()
960    }
961
962    /// Returns physical type for this column.
963    /// Note that it will panic if called on a non-primitive type.
964    pub fn physical_type(&self) -> PhysicalType {
965        match self.primitive_type.as_ref() {
966            Type::PrimitiveType { physical_type, .. } => *physical_type,
967            _ => panic!("Expected primitive type!"),
968        }
969    }
970
971    /// Returns type length for this column.
972    /// Note that it will panic if called on a non-primitive type.
973    pub fn type_length(&self) -> i32 {
974        match self.primitive_type.as_ref() {
975            Type::PrimitiveType { type_length, .. } => *type_length,
976            _ => panic!("Expected primitive type!"),
977        }
978    }
979
980    /// Returns type precision for this column.
981    /// Note that it will panic if called on a non-primitive type.
982    pub fn type_precision(&self) -> i32 {
983        match self.primitive_type.as_ref() {
984            Type::PrimitiveType { precision, .. } => *precision,
985            _ => panic!("Expected primitive type!"),
986        }
987    }
988
989    /// Returns type scale for this column.
990    /// Note that it will panic if called on a non-primitive type.
991    pub fn type_scale(&self) -> i32 {
992        match self.primitive_type.as_ref() {
993            Type::PrimitiveType { scale, .. } => *scale,
994            _ => panic!("Expected primitive type!"),
995        }
996    }
997
998    /// Returns the sort order for this column
999    pub fn sort_order(&self) -> SortOrder {
1000        ColumnOrder::sort_order_for_type(
1001            self.logical_type_ref(),
1002            self.converted_type(),
1003            self.physical_type(),
1004        )
1005    }
1006}
1007
/// Schema of a Parquet file.
///
/// Encapsulates the file's schema ([`Type`]) and [`ColumnDescriptor`]s for
/// each primitive (leaf) column.
///
/// # Example
/// ```
/// # use std::sync::Arc;
/// use parquet::schema::types::{SchemaDescriptor, Type};
/// use parquet::basic; // note there are two `Type`s that are different
/// // Schema for a table with two columns: "a" (int64) and "b" (int32, stored as a date)
/// let descriptor = SchemaDescriptor::new(
///   Arc::new(
///     Type::group_type_builder("my_schema")
///       .with_fields(vec![
///         Arc::new(
///          Type::primitive_type_builder("a", basic::Type::INT64)
///           .build().unwrap()
///         ),
///         Arc::new(
///          Type::primitive_type_builder("b", basic::Type::INT32)
///           .with_converted_type(basic::ConvertedType::DATE)
///           .with_logical_type(Some(basic::LogicalType::Date))
///           .build().unwrap()
///         ),
///      ])
///      .build().unwrap()
///   )
/// );
/// ```
#[derive(PartialEq, Clone)]
pub struct SchemaDescriptor {
    /// The top-level logical schema (the "message" type).
    ///
    /// This must be a [`Type::GroupType`] where each field is a root
    /// column type in the schema.
    schema: TypePtr,

    /// The descriptors for the physical type of each leaf column in this schema
    ///
    /// Constructed from `schema` in DFS order.
    leaves: Vec<ColumnDescPtr>,

    /// Mapping from a leaf column's index to the root column index that it
    /// comes from.
    ///
    /// This vector is filled in lockstep with `leaves`, so `leaf_to_base[i]`
    /// is the index (within the root schema's fields) of the root column that
    /// `leaves[i]` belongs to.
    ///
    /// For instance: the leaf `a.b.c.d` would have a link back to `a`:
    /// ```text
    /// -- a  <-----+
    /// -- -- b     |
    /// -- -- -- c  |
    /// -- -- -- -- d
    /// ```
    leaf_to_base: Vec<usize>,
}
1063
1064impl fmt::Debug for SchemaDescriptor {
1065    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1066        // Skip leaves and leaf_to_base as they only a cache information already found in `schema`
1067        f.debug_struct("SchemaDescriptor")
1068            .field("schema", &self.schema)
1069            .finish()
1070    }
1071}
1072
1073// Need to implement HeapSize in this module as the fields are private
1074impl HeapSize for SchemaDescriptor {
1075    fn heap_size(&self) -> usize {
1076        self.schema.heap_size() + self.leaves.heap_size() + self.leaf_to_base.heap_size()
1077    }
1078}
1079
1080impl SchemaDescriptor {
1081    /// Creates new schema descriptor from Parquet schema.
1082    pub fn new(tp: TypePtr) -> Self {
1083        const INIT_SCHEMA_DEPTH: usize = 16;
1084        assert!(tp.is_group(), "SchemaDescriptor should take a GroupType");
1085        // unwrap should be safe since we just asserted tp is a group
1086        let n_leaves = num_leaves(&tp).unwrap();
1087        let mut leaves = Vec::with_capacity(n_leaves);
1088        let mut leaf_to_base = Vec::with_capacity(n_leaves);
1089        let mut path = Vec::with_capacity(INIT_SCHEMA_DEPTH);
1090        for (root_idx, f) in tp.get_fields().iter().enumerate() {
1091            path.clear();
1092            build_tree(
1093                f,
1094                root_idx,
1095                0,
1096                0,
1097                0,
1098                &mut leaves,
1099                &mut leaf_to_base,
1100                &mut path,
1101            );
1102        }
1103
1104        Self {
1105            schema: tp,
1106            leaves,
1107            leaf_to_base,
1108        }
1109    }
1110
1111    /// Returns [`ColumnDescriptor`] for a field position.
1112    pub fn column(&self, i: usize) -> ColumnDescPtr {
1113        assert!(
1114            i < self.leaves.len(),
1115            "Index out of bound: {} not in [0, {})",
1116            i,
1117            self.leaves.len()
1118        );
1119        self.leaves[i].clone()
1120    }
1121
1122    /// Returns slice of [`ColumnDescriptor`].
1123    pub fn columns(&self) -> &[ColumnDescPtr] {
1124        &self.leaves
1125    }
1126
1127    /// Returns number of leaf-level columns.
1128    pub fn num_columns(&self) -> usize {
1129        self.leaves.len()
1130    }
1131
1132    /// Returns column root [`Type`] for a leaf position.
1133    pub fn get_column_root(&self, i: usize) -> &Type {
1134        let result = self.column_root_of(i);
1135        result.as_ref()
1136    }
1137
1138    /// Returns column root [`Type`] pointer for a leaf position.
1139    pub fn get_column_root_ptr(&self, i: usize) -> TypePtr {
1140        let result = self.column_root_of(i);
1141        result.clone()
1142    }
1143
1144    /// Returns the index of the root column for a field position
1145    pub fn get_column_root_idx(&self, leaf: usize) -> usize {
1146        assert!(
1147            leaf < self.leaves.len(),
1148            "Index out of bound: {} not in [0, {})",
1149            leaf,
1150            self.leaves.len()
1151        );
1152
1153        *self
1154            .leaf_to_base
1155            .get(leaf)
1156            .unwrap_or_else(|| panic!("Expected a value for index {leaf} but found None"))
1157    }
1158
1159    fn column_root_of(&self, i: usize) -> &TypePtr {
1160        &self.schema.get_fields()[self.get_column_root_idx(i)]
1161    }
1162
1163    /// Returns schema as [`Type`].
1164    pub fn root_schema(&self) -> &Type {
1165        self.schema.as_ref()
1166    }
1167
1168    /// Returns schema as [`TypePtr`] for cheap cloning.
1169    pub fn root_schema_ptr(&self) -> TypePtr {
1170        self.schema.clone()
1171    }
1172
1173    /// Returns schema name.
1174    pub fn name(&self) -> &str {
1175        self.schema.name()
1176    }
1177}
1178
1179// walk tree and count nodes
1180pub(crate) fn num_nodes(tp: &TypePtr) -> Result<usize> {
1181    if !tp.is_group() {
1182        return Err(general_err!("Root schema must be Group type"));
1183    }
1184    let mut n_nodes = 1usize; // count root
1185    for f in tp.get_fields().iter() {
1186        count_nodes(f, &mut n_nodes);
1187    }
1188    Ok(n_nodes)
1189}
1190
1191pub(crate) fn count_nodes(tp: &TypePtr, n_nodes: &mut usize) {
1192    *n_nodes += 1;
1193    if let Type::GroupType { fields, .. } = tp.as_ref() {
1194        for f in fields {
1195            count_nodes(f, n_nodes);
1196        }
1197    }
1198}
1199
1200// do a quick walk of the tree to get proper sizing for SchemaDescriptor arrays
1201fn num_leaves(tp: &TypePtr) -> Result<usize> {
1202    if !tp.is_group() {
1203        return Err(general_err!("Root schema must be Group type"));
1204    }
1205    let mut n_leaves = 0usize;
1206    for f in tp.get_fields().iter() {
1207        count_leaves(f, &mut n_leaves);
1208    }
1209    Ok(n_leaves)
1210}
1211
1212fn count_leaves(tp: &TypePtr, n_leaves: &mut usize) {
1213    match tp.as_ref() {
1214        Type::PrimitiveType { .. } => *n_leaves += 1,
1215        Type::GroupType { fields, .. } => {
1216            for f in fields {
1217                count_leaves(f, n_leaves);
1218            }
1219        }
1220    }
1221}
1222
1223#[allow(clippy::too_many_arguments)]
1224fn build_tree<'a>(
1225    tp: &'a TypePtr,
1226    root_idx: usize,
1227    mut max_rep_level: i16,
1228    mut max_def_level: i16,
1229    mut repeated_ancestor_def_level: i16,
1230    leaves: &mut Vec<ColumnDescPtr>,
1231    leaf_to_base: &mut Vec<usize>,
1232    path_so_far: &mut Vec<&'a str>,
1233) {
1234    assert!(tp.get_basic_info().has_repetition());
1235
1236    path_so_far.push(tp.name());
1237    match tp.get_basic_info().repetition() {
1238        Repetition::OPTIONAL => {
1239            max_def_level += 1;
1240        }
1241        Repetition::REPEATED => {
1242            max_def_level += 1;
1243            max_rep_level += 1;
1244            repeated_ancestor_def_level = max_def_level;
1245        }
1246        _ => {}
1247    }
1248
1249    match tp.as_ref() {
1250        Type::PrimitiveType { .. } => {
1251            let mut path: Vec<String> = vec![];
1252            path.extend(path_so_far.iter().copied().map(String::from));
1253            let desc = ColumnDescriptor::new_with_repeated_ancestor(
1254                tp.clone(),
1255                max_def_level,
1256                max_rep_level,
1257                ColumnPath::new(path),
1258                repeated_ancestor_def_level,
1259            );
1260            leaves.push(Arc::new(desc));
1261            leaf_to_base.push(root_idx);
1262        }
1263        Type::GroupType { fields, .. } => {
1264            for f in fields {
1265                build_tree(
1266                    f,
1267                    root_idx,
1268                    max_rep_level,
1269                    max_def_level,
1270                    repeated_ancestor_def_level,
1271                    leaves,
1272                    leaf_to_base,
1273                    path_so_far,
1274                );
1275                path_so_far.pop();
1276            }
1277        }
1278    }
1279}
1280
1281/// Checks if the logical type is valid.
1282fn check_logical_type(logical_type: &Option<LogicalType>) -> Result<()> {
1283    if let Some(LogicalType::Integer { bit_width, .. }) = *logical_type {
1284        if bit_width != 8 && bit_width != 16 && bit_width != 32 && bit_width != 64 {
1285            return Err(general_err!(
1286                "Bit width must be 8, 16, 32, or 64 for Integer logical type"
1287            ));
1288        }
1289    }
1290    Ok(())
1291}
1292
1293// convert thrift decoded array of `SchemaElement` into this crate's representation of
1294// parquet types. this function consumes `elements`.
1295pub(crate) fn parquet_schema_from_array<'a>(elements: Vec<SchemaElement<'a>>) -> Result<TypePtr> {
1296    let mut index = 0;
1297    let num_elements = elements.len();
1298    let mut schema_nodes = Vec::with_capacity(1); // there should only be one element when done
1299
1300    // turn into iterator so we can take ownership of elements of the vector
1301    let mut elements = elements.into_iter();
1302
1303    while index < num_elements {
1304        let t = schema_from_array_helper(&mut elements, num_elements, index)?;
1305        index = t.0;
1306        schema_nodes.push(t.1);
1307    }
1308    if schema_nodes.len() != 1 {
1309        return Err(general_err!(
1310            "Expected exactly one root node, but found {}",
1311            schema_nodes.len()
1312        ));
1313    }
1314
1315    if !schema_nodes[0].is_group() {
1316        return Err(general_err!("Expected root node to be a group type"));
1317    }
1318
1319    Ok(schema_nodes.remove(0))
1320}
1321
1322// recursive helper function for schema conversion
1323fn schema_from_array_helper<'a>(
1324    elements: &mut IntoIter<SchemaElement<'a>>,
1325    num_elements: usize,
1326    index: usize,
1327) -> Result<(usize, TypePtr)> {
1328    // Whether or not the current node is root (message type).
1329    // There is only one message type node in the schema tree.
1330    let is_root_node = index == 0;
1331
1332    if index >= num_elements {
1333        return Err(general_err!(
1334            "Index out of bound, index = {}, len = {}",
1335            index,
1336            num_elements
1337        ));
1338    }
1339    let element = elements.next().expect("schema vector should not be empty");
1340
1341    // Check for empty schema
1342    if let (true, None | Some(0)) = (is_root_node, element.num_children) {
1343        let builder = Type::group_type_builder(element.name);
1344        return Ok((index + 1, Arc::new(builder.build().unwrap())));
1345    }
1346
1347    let converted_type = element.converted_type.unwrap_or(ConvertedType::NONE);
1348
1349    // LogicalType is prefered to ConvertedType, but both may be present.
1350    let logical_type = element.logical_type;
1351
1352    check_logical_type(&logical_type)?;
1353
1354    let field_id = element.field_id;
1355    match element.num_children {
1356        // From parquet-format:
1357        //   The children count is used to construct the nested relationship.
1358        //   This field is not set when the element is a primitive type
1359        // Sometimes parquet-cpp sets num_children field to 0 for primitive types, so we
1360        // have to handle this case too.
1361        None | Some(0) => {
1362            // primitive type
1363            if element.repetition_type.is_none() {
1364                return Err(general_err!(
1365                    "Repetition level must be defined for a primitive type"
1366                ));
1367            }
1368            let repetition = element.repetition_type.unwrap();
1369            if let Some(physical_type) = element.r#type {
1370                let length = element.type_length.unwrap_or(-1);
1371                let scale = element.scale.unwrap_or(-1);
1372                let precision = element.precision.unwrap_or(-1);
1373                let name = element.name;
1374                let builder = Type::primitive_type_builder(name, physical_type)
1375                    .with_repetition(repetition)
1376                    .with_converted_type(converted_type)
1377                    .with_logical_type(logical_type)
1378                    .with_length(length)
1379                    .with_precision(precision)
1380                    .with_scale(scale)
1381                    .with_id(field_id);
1382                Ok((index + 1, Arc::new(builder.build()?)))
1383            } else {
1384                let mut builder = Type::group_type_builder(element.name)
1385                    .with_converted_type(converted_type)
1386                    .with_logical_type(logical_type)
1387                    .with_id(field_id);
1388                if !is_root_node {
1389                    // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or
1390                    // REPEATED for root node.
1391                    //
1392                    // We only set repetition for group types that are not top-level message
1393                    // type. According to parquet-format:
1394                    //   Root of the schema does not have a repetition_type.
1395                    //   All other types must have one.
1396                    builder = builder.with_repetition(repetition);
1397                }
1398                Ok((index + 1, Arc::new(builder.build().unwrap())))
1399            }
1400        }
1401        Some(n) => {
1402            let repetition = element.repetition_type;
1403
1404            let mut fields = Vec::with_capacity(n as usize);
1405            let mut next_index = index + 1;
1406            for _ in 0..n {
1407                let child_result = schema_from_array_helper(elements, num_elements, next_index)?;
1408                next_index = child_result.0;
1409                fields.push(child_result.1);
1410            }
1411
1412            let mut builder = Type::group_type_builder(element.name)
1413                .with_converted_type(converted_type)
1414                .with_logical_type(logical_type)
1415                .with_fields(fields)
1416                .with_id(field_id);
1417
1418            // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or
1419            // REPEATED for root node.
1420            //
1421            // We only set repetition for group types that are not top-level message
1422            // type. According to parquet-format:
1423            //   Root of the schema does not have a repetition_type.
1424            //   All other types must have one.
1425            if !is_root_node {
1426                let Some(rep) = repetition else {
1427                    return Err(general_err!(
1428                        "Repetition level must be defined for non-root types"
1429                    ));
1430                };
1431                builder = builder.with_repetition(rep);
1432            }
1433            Ok((next_index, Arc::new(builder.build()?)))
1434        }
1435    }
1436}
1437
1438#[cfg(test)]
1439mod tests {
1440    use super::*;
1441
1442    use crate::{
1443        file::metadata::thrift::tests::{buf_to_schema_list, roundtrip_schema, schema_to_buf},
1444        schema::parser::parse_message_type,
1445    };
1446
1447    // TODO: add tests for v2 types
1448
1449    #[test]
1450    fn test_primitive_type() {
1451        let mut result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1452            .with_logical_type(Some(LogicalType::Integer {
1453                bit_width: 32,
1454                is_signed: true,
1455            }))
1456            .with_id(Some(0))
1457            .build();
1458        assert!(result.is_ok());
1459
1460        if let Ok(tp) = result {
1461            assert!(tp.is_primitive());
1462            assert!(!tp.is_group());
1463            let basic_info = tp.get_basic_info();
1464            assert_eq!(basic_info.repetition(), Repetition::OPTIONAL);
1465            assert_eq!(
1466                basic_info.logical_type_ref(),
1467                Some(&LogicalType::Integer {
1468                    bit_width: 32,
1469                    is_signed: true
1470                })
1471            );
1472            assert_eq!(basic_info.converted_type(), ConvertedType::INT_32);
1473            assert_eq!(basic_info.id(), 0);
1474            match tp {
1475                Type::PrimitiveType { physical_type, .. } => {
1476                    assert_eq!(physical_type, PhysicalType::INT32);
1477                }
1478                _ => panic!(),
1479            }
1480        }
1481
1482        // Test illegal inputs with logical type
1483        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1484            .with_repetition(Repetition::REPEATED)
1485            .with_logical_type(Some(LogicalType::Integer {
1486                is_signed: true,
1487                bit_width: 8,
1488            }))
1489            .build();
1490        assert!(result.is_err());
1491        if let Err(e) = result {
1492            assert_eq!(
1493                format!("{e}"),
1494                "Parquet error: Cannot annotate Integer { bit_width: 8, is_signed: true } from INT64 for field 'foo'"
1495            );
1496        }
1497
1498        // Test illegal inputs with converted type
1499        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1500            .with_repetition(Repetition::REPEATED)
1501            .with_converted_type(ConvertedType::BSON)
1502            .build();
1503        assert!(result.is_err());
1504        if let Err(e) = result {
1505            assert_eq!(
1506                format!("{e}"),
1507                "Parquet error: BSON cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1508            );
1509        }
1510
1511        result = Type::primitive_type_builder("foo", PhysicalType::INT96)
1512            .with_repetition(Repetition::REQUIRED)
1513            .with_converted_type(ConvertedType::DECIMAL)
1514            .with_precision(-1)
1515            .with_scale(-1)
1516            .build();
1517        assert!(result.is_err());
1518        if let Err(e) = result {
1519            assert_eq!(
1520                format!("{e}"),
1521                "Parquet error: DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
1522            );
1523        }
1524
1525        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1526            .with_repetition(Repetition::REQUIRED)
1527            .with_logical_type(Some(LogicalType::Decimal {
1528                scale: 32,
1529                precision: 12,
1530            }))
1531            .with_precision(-1)
1532            .with_scale(-1)
1533            .build();
1534        assert!(result.is_err());
1535        if let Err(e) = result {
1536            assert_eq!(
1537                format!("{e}"),
1538                "Parquet error: DECIMAL logical type scale 32 must match self.scale -1 for field 'foo'"
1539            );
1540        }
1541
1542        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1543            .with_repetition(Repetition::REQUIRED)
1544            .with_converted_type(ConvertedType::DECIMAL)
1545            .with_precision(-1)
1546            .with_scale(-1)
1547            .build();
1548        assert!(result.is_err());
1549        if let Err(e) = result {
1550            assert_eq!(
1551                format!("{e}"),
1552                "Parquet error: Invalid DECIMAL precision: -1"
1553            );
1554        }
1555
1556        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1557            .with_repetition(Repetition::REQUIRED)
1558            .with_converted_type(ConvertedType::DECIMAL)
1559            .with_precision(0)
1560            .with_scale(-1)
1561            .build();
1562        assert!(result.is_err());
1563        if let Err(e) = result {
1564            assert_eq!(
1565                format!("{e}"),
1566                "Parquet error: Invalid DECIMAL precision: 0"
1567            );
1568        }
1569
1570        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1571            .with_repetition(Repetition::REQUIRED)
1572            .with_converted_type(ConvertedType::DECIMAL)
1573            .with_precision(1)
1574            .with_scale(-1)
1575            .build();
1576        assert!(result.is_err());
1577        if let Err(e) = result {
1578            assert_eq!(format!("{e}"), "Parquet error: Invalid DECIMAL scale: -1");
1579        }
1580
1581        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1582            .with_repetition(Repetition::REQUIRED)
1583            .with_converted_type(ConvertedType::DECIMAL)
1584            .with_precision(1)
1585            .with_scale(2)
1586            .build();
1587        assert!(result.is_err());
1588        if let Err(e) = result {
1589            assert_eq!(
1590                format!("{e}"),
1591                "Parquet error: Invalid DECIMAL: scale (2) cannot be greater than precision (1)"
1592            );
1593        }
1594
1595        // It is OK if precision == scale
1596        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1597            .with_repetition(Repetition::REQUIRED)
1598            .with_converted_type(ConvertedType::DECIMAL)
1599            .with_precision(1)
1600            .with_scale(1)
1601            .build();
1602        assert!(result.is_ok());
1603
1604        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1605            .with_repetition(Repetition::REQUIRED)
1606            .with_converted_type(ConvertedType::DECIMAL)
1607            .with_precision(18)
1608            .with_scale(2)
1609            .build();
1610        assert!(result.is_err());
1611        if let Err(e) = result {
1612            assert_eq!(
1613                format!("{e}"),
1614                "Parquet error: Cannot represent INT32 as DECIMAL with precision 18"
1615            );
1616        }
1617
1618        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1619            .with_repetition(Repetition::REQUIRED)
1620            .with_converted_type(ConvertedType::DECIMAL)
1621            .with_precision(32)
1622            .with_scale(2)
1623            .build();
1624        assert!(result.is_err());
1625        if let Err(e) = result {
1626            assert_eq!(
1627                format!("{e}"),
1628                "Parquet error: Cannot represent INT64 as DECIMAL with precision 32"
1629            );
1630        }
1631
1632        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1633            .with_repetition(Repetition::REQUIRED)
1634            .with_converted_type(ConvertedType::DECIMAL)
1635            .with_length(5)
1636            .with_precision(12)
1637            .with_scale(2)
1638            .build();
1639        assert!(result.is_err());
1640        if let Err(e) = result {
1641            assert_eq!(
1642                format!("{e}"),
1643                "Parquet error: Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length 5 and precision 12. The max precision can only be 11"
1644            );
1645        }
1646
1647        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1648            .with_repetition(Repetition::REQUIRED)
1649            .with_converted_type(ConvertedType::UINT_8)
1650            .build();
1651        assert!(result.is_err());
1652        if let Err(e) = result {
1653            assert_eq!(
1654                format!("{e}"),
1655                "Parquet error: UINT_8 cannot annotate field 'foo' because it is not a INT32 field"
1656            );
1657        }
1658
1659        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1660            .with_repetition(Repetition::REQUIRED)
1661            .with_converted_type(ConvertedType::TIME_MICROS)
1662            .build();
1663        assert!(result.is_err());
1664        if let Err(e) = result {
1665            assert_eq!(
1666                format!("{e}"),
1667                "Parquet error: TIME_MICROS cannot annotate field 'foo' because it is not a INT64 field"
1668            );
1669        }
1670
1671        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1672            .with_repetition(Repetition::REQUIRED)
1673            .with_converted_type(ConvertedType::INTERVAL)
1674            .build();
1675        assert!(result.is_err());
1676        if let Err(e) = result {
1677            assert_eq!(
1678                format!("{e}"),
1679                "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1680            );
1681        }
1682
1683        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1684            .with_repetition(Repetition::REQUIRED)
1685            .with_converted_type(ConvertedType::INTERVAL)
1686            .with_length(1)
1687            .build();
1688        assert!(result.is_err());
1689        if let Err(e) = result {
1690            assert_eq!(
1691                format!("{e}"),
1692                "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1693            );
1694        }
1695
1696        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1697            .with_repetition(Repetition::REQUIRED)
1698            .with_converted_type(ConvertedType::ENUM)
1699            .build();
1700        assert!(result.is_err());
1701        if let Err(e) = result {
1702            assert_eq!(
1703                format!("{e}"),
1704                "Parquet error: ENUM cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1705            );
1706        }
1707
1708        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1709            .with_repetition(Repetition::REQUIRED)
1710            .with_converted_type(ConvertedType::MAP)
1711            .build();
1712        assert!(result.is_err());
1713        if let Err(e) = result {
1714            assert_eq!(
1715                format!("{e}"),
1716                "Parquet error: MAP cannot be applied to primitive field 'foo'"
1717            );
1718        }
1719
1720        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1721            .with_repetition(Repetition::REQUIRED)
1722            .with_converted_type(ConvertedType::DECIMAL)
1723            .with_length(-1)
1724            .build();
1725        assert!(result.is_err());
1726        if let Err(e) = result {
1727            assert_eq!(
1728                format!("{e}"),
1729                "Parquet error: Invalid FIXED_LEN_BYTE_ARRAY length: -1 for field 'foo'"
1730            );
1731        }
1732
1733        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1734            .with_repetition(Repetition::REQUIRED)
1735            .with_logical_type(Some(LogicalType::Float16))
1736            .with_length(2)
1737            .build();
1738        assert!(result.is_ok());
1739
1740        // Can't be other than FIXED_LEN_BYTE_ARRAY for physical type
1741        result = Type::primitive_type_builder("foo", PhysicalType::FLOAT)
1742            .with_repetition(Repetition::REQUIRED)
1743            .with_logical_type(Some(LogicalType::Float16))
1744            .with_length(2)
1745            .build();
1746        assert!(result.is_err());
1747        if let Err(e) = result {
1748            assert_eq!(
1749                format!("{e}"),
1750                "Parquet error: Cannot annotate Float16 from FLOAT for field 'foo'"
1751            );
1752        }
1753
1754        // Must have length 2
1755        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1756            .with_repetition(Repetition::REQUIRED)
1757            .with_logical_type(Some(LogicalType::Float16))
1758            .with_length(4)
1759            .build();
1760        assert!(result.is_err());
1761        if let Err(e) = result {
1762            assert_eq!(
1763                format!("{e}"),
1764                "Parquet error: FLOAT16 cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(2) field"
1765            );
1766        }
1767
1768        // Must have length 16
1769        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1770            .with_repetition(Repetition::REQUIRED)
1771            .with_logical_type(Some(LogicalType::Uuid))
1772            .with_length(15)
1773            .build();
1774        assert!(result.is_err());
1775        if let Err(e) = result {
1776            assert_eq!(
1777                format!("{e}"),
1778                "Parquet error: UUID cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(16) field"
1779            );
1780        }
1781
1782        // test unknown logical types are ok
1783        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1784            .with_logical_type(Some(LogicalType::_Unknown { field_id: 100 }))
1785            .build();
1786        assert!(result.is_ok());
1787    }
1788
1789    #[test]
1790    fn test_group_type() {
1791        let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32)
1792            .with_converted_type(ConvertedType::INT_32)
1793            .with_id(Some(0))
1794            .build();
1795        assert!(f1.is_ok());
1796        let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY)
1797            .with_converted_type(ConvertedType::UTF8)
1798            .with_id(Some(1))
1799            .build();
1800        assert!(f2.is_ok());
1801
1802        let fields = vec![Arc::new(f1.unwrap()), Arc::new(f2.unwrap())];
1803
1804        let result = Type::group_type_builder("foo")
1805            .with_repetition(Repetition::REPEATED)
1806            .with_logical_type(Some(LogicalType::List))
1807            .with_fields(fields)
1808            .with_id(Some(1))
1809            .build();
1810        assert!(result.is_ok());
1811
1812        let tp = result.unwrap();
1813        let basic_info = tp.get_basic_info();
1814        assert!(tp.is_group());
1815        assert!(!tp.is_primitive());
1816        assert_eq!(basic_info.repetition(), Repetition::REPEATED);
1817        assert_eq!(basic_info.logical_type_ref(), Some(&LogicalType::List));
1818        assert_eq!(basic_info.converted_type(), ConvertedType::LIST);
1819        assert_eq!(basic_info.id(), 1);
1820        assert_eq!(tp.get_fields().len(), 2);
1821        assert_eq!(tp.get_fields()[0].name(), "f1");
1822        assert_eq!(tp.get_fields()[1].name(), "f2");
1823    }
1824
1825    #[test]
1826    fn test_column_descriptor() {
1827        let result = test_column_descriptor_helper();
1828        assert!(
1829            result.is_ok(),
1830            "Expected result to be OK but got err:\n {}",
1831            result.unwrap_err()
1832        );
1833    }
1834
1835    fn test_column_descriptor_helper() -> Result<()> {
1836        let tp = Type::primitive_type_builder("name", PhysicalType::BYTE_ARRAY)
1837            .with_converted_type(ConvertedType::UTF8)
1838            .build()?;
1839
1840        let descr = ColumnDescriptor::new(Arc::new(tp), 4, 1, ColumnPath::from("name"));
1841
1842        assert_eq!(descr.path(), &ColumnPath::from("name"));
1843        assert_eq!(descr.converted_type(), ConvertedType::UTF8);
1844        assert_eq!(descr.physical_type(), PhysicalType::BYTE_ARRAY);
1845        assert_eq!(descr.max_def_level(), 4);
1846        assert_eq!(descr.max_rep_level(), 1);
1847        assert_eq!(descr.name(), "name");
1848        assert_eq!(descr.type_length(), -1);
1849        assert_eq!(descr.type_precision(), -1);
1850        assert_eq!(descr.type_scale(), -1);
1851
1852        Ok(())
1853    }
1854
1855    #[test]
1856    fn test_schema_descriptor() {
1857        let result = test_schema_descriptor_helper();
1858        assert!(
1859            result.is_ok(),
1860            "Expected result to be OK but got err:\n {}",
1861            result.unwrap_err()
1862        );
1863    }
1864
1865    // A helper fn to avoid handling the results from type creation
1866    fn test_schema_descriptor_helper() -> Result<()> {
1867        let mut fields = vec![];
1868
1869        let inta = Type::primitive_type_builder("a", PhysicalType::INT32)
1870            .with_repetition(Repetition::REQUIRED)
1871            .with_converted_type(ConvertedType::INT_32)
1872            .build()?;
1873        fields.push(Arc::new(inta));
1874        let intb = Type::primitive_type_builder("b", PhysicalType::INT64)
1875            .with_converted_type(ConvertedType::INT_64)
1876            .build()?;
1877        fields.push(Arc::new(intb));
1878        let intc = Type::primitive_type_builder("c", PhysicalType::BYTE_ARRAY)
1879            .with_repetition(Repetition::REPEATED)
1880            .with_converted_type(ConvertedType::UTF8)
1881            .build()?;
1882        fields.push(Arc::new(intc));
1883
1884        // 3-level list encoding
1885        let item1 = Type::primitive_type_builder("item1", PhysicalType::INT64)
1886            .with_repetition(Repetition::REQUIRED)
1887            .with_converted_type(ConvertedType::INT_64)
1888            .build()?;
1889        let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?;
1890        let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32)
1891            .with_repetition(Repetition::REPEATED)
1892            .with_converted_type(ConvertedType::INT_32)
1893            .build()?;
1894        let list = Type::group_type_builder("records")
1895            .with_repetition(Repetition::REPEATED)
1896            .with_converted_type(ConvertedType::LIST)
1897            .with_fields(vec![Arc::new(item1), Arc::new(item2), Arc::new(item3)])
1898            .build()?;
1899        let bag = Type::group_type_builder("bag")
1900            .with_repetition(Repetition::OPTIONAL)
1901            .with_fields(vec![Arc::new(list)])
1902            .build()?;
1903        fields.push(Arc::new(bag));
1904
1905        let schema = Type::group_type_builder("schema")
1906            .with_repetition(Repetition::REPEATED)
1907            .with_fields(fields)
1908            .build()?;
1909        let descr = SchemaDescriptor::new(Arc::new(schema));
1910
1911        let nleaves = 6;
1912        assert_eq!(descr.num_columns(), nleaves);
1913
1914        //                             mdef mrep
1915        // required int32 a            0    0
1916        // optional int64 b            1    0
1917        // repeated byte_array c       1    1
1918        // optional group bag          1    0
1919        //   repeated group records    2    1
1920        //     required int64 item1    2    1
1921        //     optional boolean item2  3    1
1922        //     repeated int32 item3    3    2
1923        let ex_max_def_levels = [0, 1, 1, 2, 3, 3];
1924        let ex_max_rep_levels = [0, 0, 1, 1, 1, 2];
1925
1926        for i in 0..nleaves {
1927            let col = descr.column(i);
1928            assert_eq!(col.max_def_level(), ex_max_def_levels[i], "{i}");
1929            assert_eq!(col.max_rep_level(), ex_max_rep_levels[i], "{i}");
1930        }
1931
1932        assert_eq!(descr.column(0).path().string(), "a");
1933        assert_eq!(descr.column(1).path().string(), "b");
1934        assert_eq!(descr.column(2).path().string(), "c");
1935        assert_eq!(descr.column(3).path().string(), "bag.records.item1");
1936        assert_eq!(descr.column(4).path().string(), "bag.records.item2");
1937        assert_eq!(descr.column(5).path().string(), "bag.records.item3");
1938
1939        assert_eq!(descr.get_column_root(0).name(), "a");
1940        assert_eq!(descr.get_column_root(3).name(), "bag");
1941        assert_eq!(descr.get_column_root(4).name(), "bag");
1942        assert_eq!(descr.get_column_root(5).name(), "bag");
1943
1944        Ok(())
1945    }
1946
1947    #[test]
1948    fn test_schema_build_tree_def_rep_levels() {
1949        let message_type = "
1950    message spark_schema {
1951      REQUIRED INT32 a;
1952      OPTIONAL group b {
1953        OPTIONAL INT32 _1;
1954        OPTIONAL INT32 _2;
1955      }
1956      OPTIONAL group c (LIST) {
1957        REPEATED group list {
1958          OPTIONAL INT32 element;
1959        }
1960      }
1961    }
1962    ";
1963        let schema = parse_message_type(message_type).expect("should parse schema");
1964        let descr = SchemaDescriptor::new(Arc::new(schema));
1965        // required int32 a
1966        assert_eq!(descr.column(0).max_def_level(), 0);
1967        assert_eq!(descr.column(0).max_rep_level(), 0);
1968        // optional int32 b._1
1969        assert_eq!(descr.column(1).max_def_level(), 2);
1970        assert_eq!(descr.column(1).max_rep_level(), 0);
1971        // optional int32 b._2
1972        assert_eq!(descr.column(2).max_def_level(), 2);
1973        assert_eq!(descr.column(2).max_rep_level(), 0);
1974        // repeated optional int32 c.list.element
1975        assert_eq!(descr.column(3).max_def_level(), 3);
1976        assert_eq!(descr.column(3).max_rep_level(), 1);
1977    }
1978
1979    #[test]
1980    fn test_schema_build_tree_repeated_ancestor_def_level() {
1981        // Flat columns: no REPEATED ancestor → repeated_ancestor_def_level = 0
1982        let message_type = "
1983    message m {
1984      REQUIRED INT32 a;
1985      OPTIONAL INT32 b;
1986      OPTIONAL group s {
1987        OPTIONAL INT32 x;
1988      }
1989    }
1990    ";
1991        let schema = parse_message_type(message_type).expect("should parse schema");
1992        let descr = SchemaDescriptor::new(Arc::new(schema));
1993        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 0); // a
1994        assert_eq!(descr.column(1).repeated_ancestor_def_level(), 0); // b
1995        assert_eq!(descr.column(2).repeated_ancestor_def_level(), 0); // s.x
1996
1997        // Standard list: OPTIONAL outer, REPEATED group, OPTIONAL element
1998        // repeated_ancestor_def_level is the def_level at the REPEATED group (= 2)
1999        let message_type = "
2000    message m {
2001      OPTIONAL group c (LIST) {
2002        REPEATED group list {
2003          OPTIONAL INT32 element;
2004        }
2005      }
2006    }
2007    ";
2008        let schema = parse_message_type(message_type).expect("should parse schema");
2009        let descr = SchemaDescriptor::new(Arc::new(schema));
2010        // c(optional)=1, list(repeated)=2, element(optional)=3
2011        assert_eq!(descr.column(0).max_def_level(), 3);
2012        assert_eq!(descr.column(0).max_rep_level(), 1);
2013        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 2);
2014
2015        // Required list: REQUIRED outer, REPEATED group, REQUIRED element
2016        // No OPTIONAL nodes between REPEATED and leaf, so repeated_ancestor_def_level == max_def_level
2017        let message_type = "
2018    message m {
2019      REQUIRED group c (LIST) {
2020        REPEATED group list {
2021          REQUIRED INT32 element;
2022        }
2023      }
2024    }
2025    ";
2026        let schema = parse_message_type(message_type).expect("should parse schema");
2027        let descr = SchemaDescriptor::new(Arc::new(schema));
2028        // list(repeated)=1, element(required)=1
2029        assert_eq!(descr.column(0).max_def_level(), 1);
2030        assert_eq!(descr.column(0).max_rep_level(), 1);
2031        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 1);
2032
2033        // Nested lists: innermost REPEATED wins
2034        let message_type = "
2035    message m {
2036      OPTIONAL group outer (LIST) {
2037        REPEATED group list {
2038          OPTIONAL group inner (LIST) {
2039            REPEATED group list2 {
2040              OPTIONAL INT32 element;
2041            }
2042          }
2043        }
2044      }
2045    }
2046    ";
2047        let schema = parse_message_type(message_type).expect("should parse schema");
2048        let descr = SchemaDescriptor::new(Arc::new(schema));
2049        // outer(opt)=1, list(rep)=2, inner(opt)=3, list2(rep)=4, element(opt)=5
2050        assert_eq!(descr.column(0).max_def_level(), 5);
2051        assert_eq!(descr.column(0).max_rep_level(), 2);
2052        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 4);
2053
2054        // Struct inside list: all sibling leaves share the same repeated_ancestor_def_level
2055        let message_type = "
2056    message m {
2057      OPTIONAL group bag (LIST) {
2058        REPEATED group list {
2059          REQUIRED group item {
2060            OPTIONAL INT32 x;
2061            REQUIRED INT32 y;
2062          }
2063        }
2064      }
2065    }
2066    ";
2067        let schema = parse_message_type(message_type).expect("should parse schema");
2068        let descr = SchemaDescriptor::new(Arc::new(schema));
2069        // bag(opt)=1, list(rep)=2, item(req)=2, x(opt)=3
2070        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 2); // bag.list.item.x
2071        // bag(opt)=1, list(rep)=2, item(req)=2, y(req)=2
2072        assert_eq!(descr.column(1).repeated_ancestor_def_level(), 2); // bag.list.item.y
2073
2074        // Map type: key (required) and value (optional) under the same REPEATED group
2075        let message_type = "
2076    message m {
2077      OPTIONAL group my_map (MAP) {
2078        REPEATED group key_value {
2079          REQUIRED BYTE_ARRAY key (UTF8);
2080          OPTIONAL INT32 value;
2081        }
2082      }
2083    }
2084    ";
2085        let schema = parse_message_type(message_type).expect("should parse schema");
2086        let descr = SchemaDescriptor::new(Arc::new(schema));
2087        // my_map(opt)=1, key_value(rep)=2, key(req)=2
2088        assert_eq!(descr.column(0).max_def_level(), 2);
2089        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 2); // key: max_def == repeated_ancestor
2090        // my_map(opt)=1, key_value(rep)=2, value(opt)=3
2091        assert_eq!(descr.column(1).max_def_level(), 3);
2092        assert_eq!(descr.column(1).repeated_ancestor_def_level(), 2); // value: max_def > repeated_ancestor
2093    }
2094
2095    #[test]
2096    #[should_panic(expected = "Cannot call get_physical_type() on a non-primitive type")]
2097    fn test_get_physical_type_panic() {
2098        let list = Type::group_type_builder("records")
2099            .with_repetition(Repetition::REPEATED)
2100            .build()
2101            .unwrap();
2102        list.get_physical_type();
2103    }
2104
2105    #[test]
2106    fn test_get_physical_type_primitive() {
2107        let f = Type::primitive_type_builder("f", PhysicalType::INT64)
2108            .build()
2109            .unwrap();
2110        assert_eq!(f.get_physical_type(), PhysicalType::INT64);
2111
2112        let f = Type::primitive_type_builder("f", PhysicalType::BYTE_ARRAY)
2113            .build()
2114            .unwrap();
2115        assert_eq!(f.get_physical_type(), PhysicalType::BYTE_ARRAY);
2116    }
2117
2118    #[test]
2119    fn test_check_contains_primitive_primitive() {
2120        // OK
2121        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2122            .build()
2123            .unwrap();
2124        let f2 = Type::primitive_type_builder("f", PhysicalType::INT32)
2125            .build()
2126            .unwrap();
2127        assert!(f1.check_contains(&f2));
2128
2129        // OK: different logical type does not affect check_contains
2130        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2131            .with_converted_type(ConvertedType::UINT_8)
2132            .build()
2133            .unwrap();
2134        let f2 = Type::primitive_type_builder("f", PhysicalType::INT32)
2135            .with_converted_type(ConvertedType::UINT_16)
2136            .build()
2137            .unwrap();
2138        assert!(f1.check_contains(&f2));
2139
2140        // KO: different name
2141        let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32)
2142            .build()
2143            .unwrap();
2144        let f2 = Type::primitive_type_builder("f2", PhysicalType::INT32)
2145            .build()
2146            .unwrap();
2147        assert!(!f1.check_contains(&f2));
2148
2149        // KO: different type
2150        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2151            .build()
2152            .unwrap();
2153        let f2 = Type::primitive_type_builder("f", PhysicalType::INT64)
2154            .build()
2155            .unwrap();
2156        assert!(!f1.check_contains(&f2));
2157
2158        // KO: different repetition
2159        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2160            .with_repetition(Repetition::REQUIRED)
2161            .build()
2162            .unwrap();
2163        let f2 = Type::primitive_type_builder("f", PhysicalType::INT32)
2164            .with_repetition(Repetition::OPTIONAL)
2165            .build()
2166            .unwrap();
2167        assert!(!f1.check_contains(&f2));
2168    }
2169
2170    // function to create a new group type for testing
2171    fn test_new_group_type(name: &str, repetition: Repetition, types: Vec<Type>) -> Type {
2172        Type::group_type_builder(name)
2173            .with_repetition(repetition)
2174            .with_fields(types.into_iter().map(Arc::new).collect())
2175            .build()
2176            .unwrap()
2177    }
2178
2179    #[test]
2180    fn test_check_contains_group_group() {
2181        // OK: should match okay with empty fields
2182        let f1 = Type::group_type_builder("f").build().unwrap();
2183        let f2 = Type::group_type_builder("f").build().unwrap();
2184        assert!(f1.check_contains(&f2));
2185        assert!(!f1.is_optional());
2186
2187        // OK: fields match
2188        let f1 = test_new_group_type(
2189            "f",
2190            Repetition::REPEATED,
2191            vec![
2192                Type::primitive_type_builder("f1", PhysicalType::INT32)
2193                    .build()
2194                    .unwrap(),
2195                Type::primitive_type_builder("f2", PhysicalType::INT64)
2196                    .build()
2197                    .unwrap(),
2198            ],
2199        );
2200        let f2 = test_new_group_type(
2201            "f",
2202            Repetition::REPEATED,
2203            vec![
2204                Type::primitive_type_builder("f1", PhysicalType::INT32)
2205                    .build()
2206                    .unwrap(),
2207                Type::primitive_type_builder("f2", PhysicalType::INT64)
2208                    .build()
2209                    .unwrap(),
2210            ],
2211        );
2212        assert!(f1.check_contains(&f2));
2213
2214        // OK: subset of fields
2215        let f1 = test_new_group_type(
2216            "f",
2217            Repetition::REPEATED,
2218            vec![
2219                Type::primitive_type_builder("f1", PhysicalType::INT32)
2220                    .build()
2221                    .unwrap(),
2222                Type::primitive_type_builder("f2", PhysicalType::INT64)
2223                    .build()
2224                    .unwrap(),
2225            ],
2226        );
2227        let f2 = test_new_group_type(
2228            "f",
2229            Repetition::REPEATED,
2230            vec![
2231                Type::primitive_type_builder("f2", PhysicalType::INT64)
2232                    .build()
2233                    .unwrap(),
2234            ],
2235        );
2236        assert!(f1.check_contains(&f2));
2237
2238        // KO: different name
2239        let f1 = Type::group_type_builder("f1").build().unwrap();
2240        let f2 = Type::group_type_builder("f2").build().unwrap();
2241        assert!(!f1.check_contains(&f2));
2242
2243        // KO: different repetition
2244        let f1 = Type::group_type_builder("f")
2245            .with_repetition(Repetition::OPTIONAL)
2246            .build()
2247            .unwrap();
2248        let f2 = Type::group_type_builder("f")
2249            .with_repetition(Repetition::REPEATED)
2250            .build()
2251            .unwrap();
2252        assert!(!f1.check_contains(&f2));
2253
2254        // KO: different fields
2255        let f1 = test_new_group_type(
2256            "f",
2257            Repetition::REPEATED,
2258            vec![
2259                Type::primitive_type_builder("f1", PhysicalType::INT32)
2260                    .build()
2261                    .unwrap(),
2262                Type::primitive_type_builder("f2", PhysicalType::INT64)
2263                    .build()
2264                    .unwrap(),
2265            ],
2266        );
2267        let f2 = test_new_group_type(
2268            "f",
2269            Repetition::REPEATED,
2270            vec![
2271                Type::primitive_type_builder("f1", PhysicalType::INT32)
2272                    .build()
2273                    .unwrap(),
2274                Type::primitive_type_builder("f2", PhysicalType::BOOLEAN)
2275                    .build()
2276                    .unwrap(),
2277            ],
2278        );
2279        assert!(!f1.check_contains(&f2));
2280
2281        // KO: different fields
2282        let f1 = test_new_group_type(
2283            "f",
2284            Repetition::REPEATED,
2285            vec![
2286                Type::primitive_type_builder("f1", PhysicalType::INT32)
2287                    .build()
2288                    .unwrap(),
2289                Type::primitive_type_builder("f2", PhysicalType::INT64)
2290                    .build()
2291                    .unwrap(),
2292            ],
2293        );
2294        let f2 = test_new_group_type(
2295            "f",
2296            Repetition::REPEATED,
2297            vec![
2298                Type::primitive_type_builder("f3", PhysicalType::INT32)
2299                    .build()
2300                    .unwrap(),
2301            ],
2302        );
2303        assert!(!f1.check_contains(&f2));
2304    }
2305
2306    #[test]
2307    fn test_check_contains_group_primitive() {
2308        // KO: should not match
2309        let f1 = Type::group_type_builder("f").build().unwrap();
2310        let f2 = Type::primitive_type_builder("f", PhysicalType::INT64)
2311            .build()
2312            .unwrap();
2313        assert!(!f1.check_contains(&f2));
2314        assert!(!f2.check_contains(&f1));
2315
2316        // KO: should not match when primitive field is part of group type
2317        let f1 = test_new_group_type(
2318            "f",
2319            Repetition::REPEATED,
2320            vec![
2321                Type::primitive_type_builder("f1", PhysicalType::INT32)
2322                    .build()
2323                    .unwrap(),
2324            ],
2325        );
2326        let f2 = Type::primitive_type_builder("f1", PhysicalType::INT32)
2327            .build()
2328            .unwrap();
2329        assert!(!f1.check_contains(&f2));
2330        assert!(!f2.check_contains(&f1));
2331
2332        // OK: match nested types
2333        let f1 = test_new_group_type(
2334            "a",
2335            Repetition::REPEATED,
2336            vec![
2337                test_new_group_type(
2338                    "b",
2339                    Repetition::REPEATED,
2340                    vec![
2341                        Type::primitive_type_builder("c", PhysicalType::INT32)
2342                            .build()
2343                            .unwrap(),
2344                    ],
2345                ),
2346                Type::primitive_type_builder("d", PhysicalType::INT64)
2347                    .build()
2348                    .unwrap(),
2349                Type::primitive_type_builder("e", PhysicalType::BOOLEAN)
2350                    .build()
2351                    .unwrap(),
2352            ],
2353        );
2354        let f2 = test_new_group_type(
2355            "a",
2356            Repetition::REPEATED,
2357            vec![test_new_group_type(
2358                "b",
2359                Repetition::REPEATED,
2360                vec![
2361                    Type::primitive_type_builder("c", PhysicalType::INT32)
2362                        .build()
2363                        .unwrap(),
2364                ],
2365            )],
2366        );
2367        assert!(f1.check_contains(&f2)); // should match
2368        assert!(!f2.check_contains(&f1)); // should fail
2369    }
2370
2371    #[test]
2372    fn test_schema_type_thrift_conversion_err() {
2373        let schema = Type::primitive_type_builder("col", PhysicalType::INT32)
2374            .build()
2375            .unwrap();
2376        let schema = Arc::new(schema);
2377        let thrift_schema = schema_to_buf(&schema);
2378        assert!(thrift_schema.is_err());
2379        if let Err(e) = thrift_schema {
2380            assert_eq!(
2381                format!("{e}"),
2382                "Parquet error: Root schema must be Group type"
2383            );
2384        }
2385    }
2386
2387    #[test]
2388    fn test_schema_type_thrift_conversion() {
2389        let message_type = "
2390    message conversions {
2391      REQUIRED INT64 id;
2392      OPTIONAL FIXED_LEN_BYTE_ARRAY (2) f16 (FLOAT16);
2393      OPTIONAL group int_array_Array (LIST) {
2394        REPEATED group list {
2395          OPTIONAL group element (LIST) {
2396            REPEATED group list {
2397              OPTIONAL INT32 element;
2398            }
2399          }
2400        }
2401      }
2402      OPTIONAL group int_map (MAP) {
2403        REPEATED group map (MAP_KEY_VALUE) {
2404          REQUIRED BYTE_ARRAY key (UTF8);
2405          OPTIONAL INT32 value;
2406        }
2407      }
2408      OPTIONAL group int_Map_Array (LIST) {
2409        REPEATED group list {
2410          OPTIONAL group g (MAP) {
2411            REPEATED group map (MAP_KEY_VALUE) {
2412              REQUIRED BYTE_ARRAY key (UTF8);
2413              OPTIONAL group value {
2414                OPTIONAL group H {
2415                  OPTIONAL group i (LIST) {
2416                    REPEATED group list {
2417                      OPTIONAL DOUBLE element;
2418                    }
2419                  }
2420                }
2421              }
2422            }
2423          }
2424        }
2425      }
2426      OPTIONAL group nested_struct {
2427        OPTIONAL INT32 A;
2428        OPTIONAL group b (LIST) {
2429          REPEATED group list {
2430            REQUIRED FIXED_LEN_BYTE_ARRAY (16) element;
2431          }
2432        }
2433      }
2434    }
2435    ";
2436        let expected_schema = parse_message_type(message_type).unwrap();
2437        let result_schema = roundtrip_schema(Arc::new(expected_schema.clone())).unwrap();
2438        assert_eq!(result_schema, Arc::new(expected_schema));
2439    }
2440
2441    #[test]
2442    fn test_schema_type_thrift_conversion_decimal() {
2443        let message_type = "
2444    message decimals {
2445      OPTIONAL INT32 field0;
2446      OPTIONAL INT64 field1 (DECIMAL (18, 2));
2447      OPTIONAL FIXED_LEN_BYTE_ARRAY (16) field2 (DECIMAL (38, 18));
2448      OPTIONAL BYTE_ARRAY field3 (DECIMAL (9));
2449    }
2450    ";
2451        let expected_schema = parse_message_type(message_type).unwrap();
2452        let result_schema = roundtrip_schema(Arc::new(expected_schema.clone())).unwrap();
2453        assert_eq!(result_schema, Arc::new(expected_schema));
2454    }
2455
2456    // Tests schema conversion from thrift, when num_children is set to Some(0) for a
2457    // primitive type.
2458    #[test]
2459    fn test_schema_from_thrift_with_num_children_set() {
2460        // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT
2461        let message_type = "
2462    message schema {
2463      OPTIONAL BYTE_ARRAY id (UTF8);
2464      OPTIONAL BYTE_ARRAY name (UTF8);
2465      OPTIONAL BYTE_ARRAY message (UTF8);
2466      OPTIONAL INT32 type (UINT_8);
2467      OPTIONAL INT64 author_time (TIMESTAMP_MILLIS);
2468      OPTIONAL INT64 __index_level_0__;
2469    }
2470    ";
2471
2472        let expected_schema = Arc::new(parse_message_type(message_type).unwrap());
2473        let mut buf = schema_to_buf(&expected_schema).unwrap();
2474        let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap();
2475
2476        // Change all of None to Some(0)
2477        for elem in &mut thrift_schema[..] {
2478            if elem.num_children.is_none() {
2479                elem.num_children = Some(0);
2480            }
2481        }
2482
2483        let result_schema = parquet_schema_from_array(thrift_schema).unwrap();
2484        assert_eq!(result_schema, expected_schema);
2485    }
2486
2487    // Sometimes parquet-cpp sets repetition level for the root node, which is against
2488    // the format definition, but we need to handle it by setting it back to None.
2489    #[test]
2490    fn test_schema_from_thrift_root_has_repetition() {
2491        // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT
2492        let message_type = "
2493    message schema {
2494      OPTIONAL BYTE_ARRAY a (UTF8);
2495      OPTIONAL INT32 b (UINT_8);
2496    }
2497    ";
2498
2499        let expected_schema = Arc::new(parse_message_type(message_type).unwrap());
2500        let mut buf = schema_to_buf(&expected_schema).unwrap();
2501        let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap();
2502        thrift_schema[0].repetition_type = Some(Repetition::REQUIRED);
2503
2504        let result_schema = parquet_schema_from_array(thrift_schema).unwrap();
2505        assert_eq!(result_schema, expected_schema);
2506    }
2507
2508    #[test]
2509    fn test_schema_from_thrift_group_has_no_child() {
2510        let message_type = "message schema {}";
2511
2512        let expected_schema = Arc::new(parse_message_type(message_type).unwrap());
2513        let mut buf = schema_to_buf(&expected_schema).unwrap();
2514        let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap();
2515        thrift_schema[0].repetition_type = Some(Repetition::REQUIRED);
2516
2517        let result_schema = parquet_schema_from_array(thrift_schema).unwrap();
2518        assert_eq!(result_schema, expected_schema);
2519    }
2520}