Skip to main content

parquet/schema/
types.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains structs and methods to build Parquet schema and schema descriptors.
19
20use std::vec::IntoIter;
21use std::{collections::HashMap, fmt, sync::Arc};
22
23use crate::file::metadata::HeapSize;
24use crate::file::metadata::thrift::SchemaElement;
25
26use crate::basic::{
27    ColumnOrder, ConvertedType, IntType, LogicalType, Repetition, SortOrder, TimeType, TimeUnit,
28    Type as PhysicalType,
29};
30use crate::errors::{ParquetError, Result};
31
32// ----------------------------------------------------------------------
33// Parquet Type definitions
34
/// Shared, reference-counted handle to a schema [`Type`] (alias for `Arc<Type>`).
pub type TypePtr = Arc<Type>;
/// Shared, reference-counted handle to a [`SchemaDescriptor`] (alias for `Arc<SchemaDescriptor>`).
pub type SchemaDescPtr = Arc<SchemaDescriptor>;
/// Shared, reference-counted handle to a [`ColumnDescriptor`] (alias for `Arc<ColumnDescriptor>`).
pub type ColumnDescPtr = Arc<ColumnDescriptor>;
41
/// Representation of a Parquet type.
///
/// Used to describe primitive leaf fields and structs, including top-level schema.
///
/// Note that the top-level schema is represented using [`Type::GroupType`] whose
/// repetition is `None`.
///
/// Instances are normally created through [`Type::primitive_type_builder`] and
/// [`Type::group_type_builder`].
#[derive(Clone, Debug, PartialEq)]
pub enum Type {
    /// Represents a primitive leaf field.
    PrimitiveType {
        /// Basic information about the type.
        basic_info: BasicTypeInfo,
        /// Physical type of this primitive type.
        physical_type: PhysicalType,
        /// Length of this type. [`PrimitiveTypeBuilder`] stores `-1` here unless a
        /// length was set explicitly.
        type_length: i32,
        /// Scale of this type. [`PrimitiveTypeBuilder`] stores `-1` here unless a
        /// scale was set explicitly.
        scale: i32,
        /// Precision of this type. [`PrimitiveTypeBuilder`] stores `-1` here unless a
        /// precision was set explicitly.
        precision: i32,
    },
    /// Represents a group of fields (similar to struct).
    GroupType {
        /// Basic information about the type.
        basic_info: BasicTypeInfo,
        /// Fields of this group type.
        fields: Vec<TypePtr>,
    },
}
71
72impl HeapSize for Type {
73    fn heap_size(&self) -> usize {
74        match self {
75            Type::PrimitiveType { basic_info, .. } => basic_info.heap_size(),
76            Type::GroupType { basic_info, fields } => basic_info.heap_size() + fields.heap_size(),
77        }
78    }
79}
80
81impl Type {
82    /// Creates primitive type builder with provided field name and physical type.
83    pub fn primitive_type_builder(
84        name: &str,
85        physical_type: PhysicalType,
86    ) -> PrimitiveTypeBuilder<'_> {
87        PrimitiveTypeBuilder::new(name, physical_type)
88    }
89
90    /// Creates group type builder with provided column name.
91    pub fn group_type_builder(name: &str) -> GroupTypeBuilder<'_> {
92        GroupTypeBuilder::new(name)
93    }
94
95    /// Returns [`BasicTypeInfo`] information about the type.
96    pub fn get_basic_info(&self) -> &BasicTypeInfo {
97        match *self {
98            Type::PrimitiveType { ref basic_info, .. } => basic_info,
99            Type::GroupType { ref basic_info, .. } => basic_info,
100        }
101    }
102
103    /// Returns this type's field name.
104    pub fn name(&self) -> &str {
105        self.get_basic_info().name()
106    }
107
108    /// Gets the fields from this group type.
109    /// Note that this will panic if called on a non-group type.
110    // TODO: should we return `&[&Type]` here?
111    pub fn get_fields(&self) -> &[TypePtr] {
112        match *self {
113            Type::GroupType { ref fields, .. } => &fields[..],
114            _ => panic!("Cannot call get_fields() on a non-group type"),
115        }
116    }
117
118    /// Gets physical type of this primitive type.
119    /// Note that this will panic if called on a non-primitive type.
120    pub fn get_physical_type(&self) -> PhysicalType {
121        match *self {
122            Type::PrimitiveType {
123                basic_info: _,
124                physical_type,
125                ..
126            } => physical_type,
127            _ => panic!("Cannot call get_physical_type() on a non-primitive type"),
128        }
129    }
130
131    /// Gets precision of this primitive type.
132    /// Note that this will panic if called on a non-primitive type.
133    pub fn get_precision(&self) -> i32 {
134        match *self {
135            Type::PrimitiveType { precision, .. } => precision,
136            _ => panic!("Cannot call get_precision() on non-primitive type"),
137        }
138    }
139
140    /// Gets scale of this primitive type.
141    /// Note that this will panic if called on a non-primitive type.
142    pub fn get_scale(&self) -> i32 {
143        match *self {
144            Type::PrimitiveType { scale, .. } => scale,
145            _ => panic!("Cannot call get_scale() on non-primitive type"),
146        }
147    }
148
149    /// Checks if `sub_type` schema is part of current schema.
150    /// This method can be used to check if projected columns are part of the root schema.
151    pub fn check_contains(&self, sub_type: &Type) -> bool {
152        // Names match, and repetitions match or not set for both
153        let basic_match = self.get_basic_info().name() == sub_type.get_basic_info().name()
154            && (self.is_schema() && sub_type.is_schema()
155                || !self.is_schema()
156                    && !sub_type.is_schema()
157                    && self.get_basic_info().repetition()
158                        == sub_type.get_basic_info().repetition());
159
160        match *self {
161            Type::PrimitiveType { .. } if basic_match && sub_type.is_primitive() => {
162                self.get_physical_type() == sub_type.get_physical_type()
163            }
164            Type::GroupType { .. } if basic_match && sub_type.is_group() => {
165                // build hashmap of name -> TypePtr
166                let mut field_map = HashMap::new();
167                for field in self.get_fields() {
168                    field_map.insert(field.name(), field);
169                }
170
171                for field in sub_type.get_fields() {
172                    if !field_map
173                        .get(field.name())
174                        .map(|tpe| tpe.check_contains(field))
175                        .unwrap_or(false)
176                    {
177                        return false;
178                    }
179                }
180                true
181            }
182            _ => false,
183        }
184    }
185
186    /// Returns `true` if this type is a primitive type, `false` otherwise.
187    pub fn is_primitive(&self) -> bool {
188        matches!(*self, Type::PrimitiveType { .. })
189    }
190
191    /// Returns `true` if this type is a group type, `false` otherwise.
192    pub fn is_group(&self) -> bool {
193        matches!(*self, Type::GroupType { .. })
194    }
195
196    /// Returns `true` if this type is the top-level schema type (message type).
197    pub fn is_schema(&self) -> bool {
198        match *self {
199            Type::GroupType { ref basic_info, .. } => !basic_info.has_repetition(),
200            _ => false,
201        }
202    }
203
204    /// Returns `true` if this type is repeated or optional.
205    /// If this type doesn't have repetition defined, we treat it as required.
206    pub fn is_optional(&self) -> bool {
207        self.get_basic_info().has_repetition()
208            && self.get_basic_info().repetition() != Repetition::REQUIRED
209    }
210
211    /// Returns `true` if this type is annotated as a list.
212    pub(crate) fn is_list(&self) -> bool {
213        if self.is_group() {
214            let basic_info = self.get_basic_info();
215            if let Some(logical_type) = basic_info.logical_type_ref() {
216                return logical_type == &LogicalType::List;
217            }
218            return basic_info.converted_type() == ConvertedType::LIST;
219        }
220        false
221    }
222
223    /// Returns `true` if this type is a group with a single child field that is `repeated`.
224    pub(crate) fn has_single_repeated_child(&self) -> bool {
225        if self.is_group() {
226            let children = self.get_fields();
227            return children.len() == 1
228                && children[0].get_basic_info().has_repetition()
229                && children[0].get_basic_info().repetition() == Repetition::REPEATED;
230        }
231        false
232    }
233}
234
/// A builder for primitive types. All attributes are optional
/// except the name and physical type.
/// Note that if not specified explicitly, `Repetition::OPTIONAL` is used.
///
/// Length, precision and scale are initialised to `-1` by
/// [`PrimitiveTypeBuilder::new`] and keep that value unless the matching
/// `with_*` setter is called.
pub struct PrimitiveTypeBuilder<'a> {
    // Field name, borrowed from the caller.
    name: &'a str,
    // Repetition of the field; `Repetition::OPTIONAL` unless overridden.
    repetition: Repetition,
    // Physical storage type of the field.
    physical_type: PhysicalType,
    // Legacy converted type annotation; `ConvertedType::NONE` unless overridden.
    converted_type: ConvertedType,
    // Logical type annotation, if any.
    logical_type: Option<LogicalType>,
    // Type length; `-1` unless set.
    length: i32,
    // Decimal precision; `-1` unless set.
    precision: i32,
    // Decimal scale; `-1` unless set.
    scale: i32,
    // Optional field id.
    id: Option<i32>,
}
249
250impl<'a> PrimitiveTypeBuilder<'a> {
251    /// Creates new primitive type builder with provided field name and physical type.
252    pub fn new(name: &'a str, physical_type: PhysicalType) -> Self {
253        Self {
254            name,
255            repetition: Repetition::OPTIONAL,
256            physical_type,
257            converted_type: ConvertedType::NONE,
258            logical_type: None,
259            length: -1,
260            precision: -1,
261            scale: -1,
262            id: None,
263        }
264    }
265
266    /// Sets [`Repetition`] for this field and returns itself.
267    pub fn with_repetition(self, repetition: Repetition) -> Self {
268        Self { repetition, ..self }
269    }
270
271    /// Sets [`ConvertedType`] for this field and returns itself.
272    pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
273        Self {
274            converted_type,
275            ..self
276        }
277    }
278
279    /// Sets [`LogicalType`] for this field and returns itself.
280    /// If only the logical type is populated for a primitive type, the converted type
281    /// will be automatically populated, and can thus be omitted.
282    pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
283        Self {
284            logical_type,
285            ..self
286        }
287    }
288
289    /// Sets type length and returns itself.
290    /// This is only applied to FIXED_LEN_BYTE_ARRAY and INT96 (INTERVAL) types, because
291    /// they maintain fixed size underlying byte array.
292    /// By default, value is `0`.
293    pub fn with_length(self, length: i32) -> Self {
294        Self { length, ..self }
295    }
296
297    /// Sets precision for Parquet DECIMAL physical type and returns itself.
298    /// By default, it equals to `0` and used only for decimal context.
299    pub fn with_precision(self, precision: i32) -> Self {
300        Self { precision, ..self }
301    }
302
303    /// Sets scale for Parquet DECIMAL physical type and returns itself.
304    /// By default, it equals to `0` and used only for decimal context.
305    pub fn with_scale(self, scale: i32) -> Self {
306        Self { scale, ..self }
307    }
308
309    /// Sets optional field id and returns itself.
310    pub fn with_id(self, id: Option<i32>) -> Self {
311        Self { id, ..self }
312    }
313
314    /// Creates a new `PrimitiveType` instance from the collected attributes.
315    /// Returns `Err` in case of any building conditions are not met.
316    pub fn build(self) -> Result<Type> {
317        let mut basic_info = BasicTypeInfo {
318            name: String::from(self.name),
319            repetition: Some(self.repetition),
320            converted_type: self.converted_type,
321            logical_type: self.logical_type.clone(),
322            id: self.id,
323        };
324
325        // Check length before logical type, since it is used for logical type validation.
326        if self.physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY && self.length < 0 {
327            return Err(general_err!(
328                "Invalid FIXED_LEN_BYTE_ARRAY length: {} for field '{}'",
329                self.length,
330                self.name
331            ));
332        }
333
334        if let Some(logical_type) = &self.logical_type {
335            // If a converted type is populated, check that it is consistent with
336            // its logical type
337            if self.converted_type != ConvertedType::NONE {
338                if ConvertedType::from(self.logical_type.clone()) != self.converted_type {
339                    return Err(general_err!(
340                        "Logical type {:?} is incompatible with converted type {} for field '{}'",
341                        logical_type,
342                        self.converted_type,
343                        self.name
344                    ));
345                }
346            } else {
347                // Populate the converted type for backwards compatibility
348                basic_info.converted_type = self.logical_type.clone().into();
349            }
350            // Check that logical type and physical type are compatible
351            match (logical_type, self.physical_type) {
352                (LogicalType::Map, _) | (LogicalType::List, _) => {
353                    return Err(general_err!(
354                        "{:?} cannot be applied to a primitive type for field '{}'",
355                        logical_type,
356                        self.name
357                    ));
358                }
359                (LogicalType::Enum, PhysicalType::BYTE_ARRAY) => {}
360                (LogicalType::Decimal(decimal), _) => {
361                    // Check that scale and precision are consistent with legacy values
362                    if decimal.scale != self.scale {
363                        return Err(general_err!(
364                            "DECIMAL logical type scale {} must match self.scale {} for field '{}'",
365                            decimal.scale,
366                            self.scale,
367                            self.name
368                        ));
369                    }
370                    if decimal.precision != self.precision {
371                        return Err(general_err!(
372                            "DECIMAL logical type precision {} must match self.precision {} for field '{}'",
373                            decimal.precision,
374                            self.precision,
375                            self.name
376                        ));
377                    }
378                    self.check_decimal_precision_scale()?;
379                }
380                (LogicalType::Date, PhysicalType::INT32) => {}
381                (
382                    LogicalType::Time(TimeType {
383                        unit: TimeUnit::MILLIS,
384                        ..
385                    }),
386                    PhysicalType::INT32,
387                ) => {}
388                (LogicalType::Time(time), PhysicalType::INT64) => {
389                    if time.unit == TimeUnit::MILLIS {
390                        return Err(general_err!(
391                            "Cannot use millisecond unit on INT64 type for field '{}'",
392                            self.name
393                        ));
394                    }
395                }
396                (LogicalType::Timestamp(_), PhysicalType::INT64) => {}
397                (LogicalType::Integer(int), PhysicalType::INT32) if int.bit_width <= 32 => {}
398                (LogicalType::Integer(int), PhysicalType::INT64) if int.bit_width == 64 => {}
399                // Null type
400                (LogicalType::Unknown, _) => {}
401                (LogicalType::String, PhysicalType::BYTE_ARRAY) => {}
402                (LogicalType::Json, PhysicalType::BYTE_ARRAY) => {}
403                (LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {}
404                (LogicalType::Geometry(_), PhysicalType::BYTE_ARRAY) => {}
405                (LogicalType::Geography(_), PhysicalType::BYTE_ARRAY) => {}
406                (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) if self.length == 16 => {}
407                (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
408                    return Err(general_err!(
409                        "UUID cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(16) field",
410                        self.name
411                    ));
412                }
413                (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY) if self.length == 2 => {}
414                (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
415                    return Err(general_err!(
416                        "FLOAT16 cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(2) field",
417                        self.name
418                    ));
419                }
420                // unknown logical type means just use physical type
421                (LogicalType::_Unknown { .. }, _) => {}
422                (a, b) => {
423                    return Err(general_err!(
424                        "Cannot annotate {:?} from {} for field '{}'",
425                        a,
426                        b,
427                        self.name
428                    ));
429                }
430            }
431        }
432
433        match self.converted_type {
434            ConvertedType::NONE => {}
435            ConvertedType::UTF8 | ConvertedType::BSON | ConvertedType::JSON => {
436                if self.physical_type != PhysicalType::BYTE_ARRAY {
437                    return Err(general_err!(
438                        "{} cannot annotate field '{}' because it is not a BYTE_ARRAY field",
439                        self.converted_type,
440                        self.name
441                    ));
442                }
443            }
444            ConvertedType::DECIMAL => {
445                self.check_decimal_precision_scale()?;
446            }
447            ConvertedType::DATE
448            | ConvertedType::TIME_MILLIS
449            | ConvertedType::UINT_8
450            | ConvertedType::UINT_16
451            | ConvertedType::UINT_32
452            | ConvertedType::INT_8
453            | ConvertedType::INT_16
454            | ConvertedType::INT_32 => {
455                if self.physical_type != PhysicalType::INT32 {
456                    return Err(general_err!(
457                        "{} cannot annotate field '{}' because it is not a INT32 field",
458                        self.converted_type,
459                        self.name
460                    ));
461                }
462            }
463            ConvertedType::TIME_MICROS
464            | ConvertedType::TIMESTAMP_MILLIS
465            | ConvertedType::TIMESTAMP_MICROS
466            | ConvertedType::UINT_64
467            | ConvertedType::INT_64 => {
468                if self.physical_type != PhysicalType::INT64 {
469                    return Err(general_err!(
470                        "{} cannot annotate field '{}' because it is not a INT64 field",
471                        self.converted_type,
472                        self.name
473                    ));
474                }
475            }
476            ConvertedType::INTERVAL => {
477                if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 {
478                    return Err(general_err!(
479                        "INTERVAL cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(12) field",
480                        self.name
481                    ));
482                }
483            }
484            ConvertedType::ENUM => {
485                if self.physical_type != PhysicalType::BYTE_ARRAY {
486                    return Err(general_err!(
487                        "ENUM cannot annotate field '{}' because it is not a BYTE_ARRAY field",
488                        self.name
489                    ));
490                }
491            }
492            _ => {
493                return Err(general_err!(
494                    "{} cannot be applied to primitive field '{}'",
495                    self.converted_type,
496                    self.name
497                ));
498            }
499        }
500
501        Ok(Type::PrimitiveType {
502            basic_info,
503            physical_type: self.physical_type,
504            type_length: self.length,
505            scale: self.scale,
506            precision: self.precision,
507        })
508    }
509
510    #[inline]
511    fn check_decimal_precision_scale(&self) -> Result<()> {
512        match self.physical_type {
513            PhysicalType::INT32
514            | PhysicalType::INT64
515            | PhysicalType::BYTE_ARRAY
516            | PhysicalType::FIXED_LEN_BYTE_ARRAY => (),
517            _ => {
518                return Err(general_err!(
519                    "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
520                ));
521            }
522        }
523
524        // Precision is required and must be a non-zero positive integer.
525        if self.precision < 1 {
526            return Err(general_err!(
527                "Invalid DECIMAL precision: {}",
528                self.precision
529            ));
530        }
531
532        // Scale must be zero or a positive integer less than the precision.
533        if self.scale < 0 {
534            return Err(general_err!("Invalid DECIMAL scale: {}", self.scale));
535        }
536
537        if self.scale > self.precision {
538            return Err(general_err!(
539                "Invalid DECIMAL: scale ({}) cannot be greater than precision \
540             ({})",
541                self.scale,
542                self.precision
543            ));
544        }
545
546        // Check precision and scale based on physical type limitations.
547        match self.physical_type {
548            PhysicalType::INT32 => {
549                if self.precision > 9 {
550                    return Err(general_err!(
551                        "Cannot represent INT32 as DECIMAL with precision {}",
552                        self.precision
553                    ));
554                }
555            }
556            PhysicalType::INT64 => {
557                if self.precision > 18 {
558                    return Err(general_err!(
559                        "Cannot represent INT64 as DECIMAL with precision {}",
560                        self.precision
561                    ));
562                }
563            }
564            PhysicalType::FIXED_LEN_BYTE_ARRAY => {
565                let length = self
566                    .length
567                    .checked_mul(8)
568                    .ok_or(general_err!("Invalid length {} for Decimal", self.length))?;
569                let max_precision = (2f64.powi(length - 1) - 1f64).log10().floor() as i32;
570
571                if self.precision > max_precision {
572                    return Err(general_err!(
573                        "Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length {} and \
574                        precision {}. The max precision can only be {}",
575                        self.length,
576                        self.precision,
577                        max_precision
578                    ));
579                }
580            }
581            _ => (), // For BYTE_ARRAY precision is not limited
582        }
583
584        Ok(())
585    }
586}
587
/// A builder for group types. All attributes are optional except the name.
/// Note that if not specified explicitly, `None` is used as the repetition of the group,
/// which means it is a root (message) type.
pub struct GroupTypeBuilder<'a> {
    // Field name, borrowed from the caller.
    name: &'a str,
    // Repetition of the group; `None` marks the root (message) type.
    repetition: Option<Repetition>,
    // Legacy converted type annotation; `ConvertedType::NONE` unless overridden.
    converted_type: ConvertedType,
    // Logical type annotation, if any.
    logical_type: Option<LogicalType>,
    // Child fields of the group; empty unless set.
    fields: Vec<TypePtr>,
    // Optional field id.
    id: Option<i32>,
}
599
600impl<'a> GroupTypeBuilder<'a> {
601    /// Creates new group type builder with provided field name.
602    pub fn new(name: &'a str) -> Self {
603        Self {
604            name,
605            repetition: None,
606            converted_type: ConvertedType::NONE,
607            logical_type: None,
608            fields: Vec::new(),
609            id: None,
610        }
611    }
612
613    /// Sets [`Repetition`] for this field and returns itself.
614    pub fn with_repetition(mut self, repetition: Repetition) -> Self {
615        self.repetition = Some(repetition);
616        self
617    }
618
619    /// Sets [`ConvertedType`] for this field and returns itself.
620    pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
621        Self {
622            converted_type,
623            ..self
624        }
625    }
626
627    /// Sets [`LogicalType`] for this field and returns itself.
628    pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
629        Self {
630            logical_type,
631            ..self
632        }
633    }
634
635    /// Sets a list of fields that should be child nodes of this field.
636    /// Returns updated self.
637    pub fn with_fields(self, fields: Vec<TypePtr>) -> Self {
638        Self { fields, ..self }
639    }
640
641    /// Sets optional field id and returns itself.
642    pub fn with_id(self, id: Option<i32>) -> Self {
643        Self { id, ..self }
644    }
645
646    /// Creates a new `GroupType` instance from the gathered attributes.
647    pub fn build(self) -> Result<Type> {
648        let mut basic_info = BasicTypeInfo {
649            name: String::from(self.name),
650            repetition: self.repetition,
651            converted_type: self.converted_type,
652            logical_type: self.logical_type.clone(),
653            id: self.id,
654        };
655        // Populate the converted type if only the logical type is populated
656        if self.logical_type.is_some() && self.converted_type == ConvertedType::NONE {
657            basic_info.converted_type = self.logical_type.into();
658        }
659        Ok(Type::GroupType {
660            basic_info,
661            fields: self.fields,
662        })
663    }
664}
665
/// Basic type info. This contains the information shared by group and primitive
/// types: the name of the type, its repetition, the converted and logical type
/// annotations, and an optional field id.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BasicTypeInfo {
    // Field name.
    name: String,
    // Repetition of the field; `None` for a group built without a repetition.
    repetition: Option<Repetition>,
    // Legacy converted type annotation.
    converted_type: ConvertedType,
    // Logical type annotation, if any.
    logical_type: Option<LogicalType>,
    // Optional field id.
    id: Option<i32>,
}
676
677impl HeapSize for BasicTypeInfo {
678    fn heap_size(&self) -> usize {
679        // no heap allocations in any other subfield
680        self.name.heap_size()
681    }
682}
683
684impl BasicTypeInfo {
685    /// Returns field name.
686    pub fn name(&self) -> &str {
687        &self.name
688    }
689
690    /// Returns `true` if type has repetition field set, `false` otherwise.
691    /// This is mostly applied to group type, because primitive type always has
692    /// repetition set.
693    pub fn has_repetition(&self) -> bool {
694        self.repetition.is_some()
695    }
696
697    /// Returns [`Repetition`] value for the type.
698    pub fn repetition(&self) -> Repetition {
699        assert!(self.repetition.is_some());
700        self.repetition.unwrap()
701    }
702
703    /// Returns [`ConvertedType`] value for the type.
704    pub fn converted_type(&self) -> ConvertedType {
705        self.converted_type
706    }
707
708    /// Returns [`LogicalType`] value for the type.
709    ///
710    /// Note that this function will clone the `LogicalType`. If performance is a concern,
711    /// use [`Self::logical_type_ref`] instead.
712    #[deprecated(
713        since = "57.1.0",
714        note = "use `BasicTypeInfo::logical_type_ref` instead (LogicalType cloning is non trivial)"
715    )]
716    pub fn logical_type(&self) -> Option<LogicalType> {
717        // Unlike ConvertedType, LogicalType cannot implement Copy, thus we clone it
718        self.logical_type.clone()
719    }
720
721    /// Return a reference to the [`LogicalType`] value for the type.
722    pub fn logical_type_ref(&self) -> Option<&LogicalType> {
723        self.logical_type.as_ref()
724    }
725
726    /// Returns `true` if id is set, `false` otherwise.
727    pub fn has_id(&self) -> bool {
728        self.id.is_some()
729    }
730
731    /// Returns id value for the type.
732    pub fn id(&self) -> i32 {
733        assert!(self.id.is_some());
734        self.id.unwrap()
735    }
736}
737
738// ----------------------------------------------------------------------
739// Parquet descriptor definitions
740
/// Represents the location of a column in a Parquet schema
///
/// The path is stored as an ordered list of field names, outermost first.
///
/// # Example: refer to column named `'my_column'`
/// ```
/// # use parquet::schema::types::ColumnPath;
/// let column_path = ColumnPath::from("my_column");
/// ```
///
/// # Example: refer to column named `c` in a nested struct `{a: {b: {c: ...}}}`
/// ```
/// # use parquet::schema::types::ColumnPath;
/// // form path 'a.b.c'
/// let column_path = ColumnPath::from(vec![
///   String::from("a"),
///   String::from("b"),
///   String::from("c")
/// ]);
/// ```
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub struct ColumnPath {
    // Path components, in order.
    parts: Vec<String>,
}
763
764impl HeapSize for ColumnPath {
765    fn heap_size(&self) -> usize {
766        self.parts.heap_size()
767    }
768}
769
770impl ColumnPath {
771    /// Creates new column path from vector of field names.
772    pub fn new(parts: Vec<String>) -> Self {
773        ColumnPath { parts }
774    }
775
776    /// Returns string representation of this column path.
777    /// ```rust
778    /// use parquet::schema::types::ColumnPath;
779    ///
780    /// let path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c".to_string()]);
781    /// assert_eq!(&path.string(), "a.b.c");
782    /// ```
783    pub fn string(&self) -> String {
784        self.parts.join(".")
785    }
786
787    /// Appends more components to end of column path.
788    /// ```rust
789    /// use parquet::schema::types::ColumnPath;
790    ///
791    /// let mut path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c"
792    /// .to_string()]);
793    /// assert_eq!(&path.string(), "a.b.c");
794    ///
795    /// path.append(vec!["d".to_string(), "e".to_string()]);
796    /// assert_eq!(&path.string(), "a.b.c.d.e");
797    /// ```
798    pub fn append(&mut self, mut tail: Vec<String>) {
799        self.parts.append(&mut tail);
800    }
801
802    /// Returns a slice of path components.
803    pub fn parts(&self) -> &[String] {
804        &self.parts
805    }
806}
807
808impl fmt::Display for ColumnPath {
809    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
810        write!(f, "{:?}", self.string())
811    }
812}
813
814impl From<Vec<String>> for ColumnPath {
815    fn from(parts: Vec<String>) -> Self {
816        ColumnPath { parts }
817    }
818}
819
820impl From<&str> for ColumnPath {
821    fn from(single_path: &str) -> Self {
822        let s = String::from(single_path);
823        ColumnPath::from(s)
824    }
825}
826
827impl From<String> for ColumnPath {
828    fn from(single_path: String) -> Self {
829        let v = vec![single_path];
830        ColumnPath { parts: v }
831    }
832}
833
834impl AsRef<[String]> for ColumnPath {
835    fn as_ref(&self) -> &[String] {
836        &self.parts
837    }
838}
839
840/// Physical type for leaf-level primitive columns.
841///
842/// Also includes the maximum definition and repetition levels required to
843/// re-assemble nested data.
844#[derive(Debug, PartialEq)]
845pub struct ColumnDescriptor {
846    /// The "leaf" primitive type of this column
847    primitive_type: TypePtr,
848
849    /// The maximum definition level for this column
850    max_def_level: i16,
851
852    /// The maximum repetition level for this column
853    max_rep_level: i16,
854
855    /// The definition level at the nearest REPEATED ancestor, or 0 if none.
856    repeated_ancestor_def_level: i16,
857
858    /// The path of this column. For instance, "a.b.c.d".
859    path: ColumnPath,
860}
861
862impl HeapSize for ColumnDescriptor {
863    fn heap_size(&self) -> usize {
864        // Don't include the heap size of primitive_type, this is already
865        // accounted for via SchemaDescriptor::schema
866        self.path.heap_size()
867    }
868}
869
870impl ColumnDescriptor {
871    /// Creates new descriptor for leaf-level column.
872    pub fn new(
873        primitive_type: TypePtr,
874        max_def_level: i16,
875        max_rep_level: i16,
876        path: ColumnPath,
877    ) -> Self {
878        Self::new_with_repeated_ancestor(primitive_type, max_def_level, max_rep_level, path, 0)
879    }
880
881    pub(crate) fn new_with_repeated_ancestor(
882        primitive_type: TypePtr,
883        max_def_level: i16,
884        max_rep_level: i16,
885        path: ColumnPath,
886        repeated_ancestor_def_level: i16,
887    ) -> Self {
888        Self {
889            primitive_type,
890            max_def_level,
891            max_rep_level,
892            repeated_ancestor_def_level,
893            path,
894        }
895    }
896
897    /// Returns maximum definition level for this column.
898    #[inline]
899    pub fn max_def_level(&self) -> i16 {
900        self.max_def_level
901    }
902
903    /// Returns maximum repetition level for this column.
904    #[inline]
905    pub fn max_rep_level(&self) -> i16 {
906        self.max_rep_level
907    }
908
909    /// Returns the definition level at the nearest REPEATED ancestor, or 0 if none.
910    #[inline]
911    pub fn repeated_ancestor_def_level(&self) -> i16 {
912        self.repeated_ancestor_def_level
913    }
914
915    /// Returns [`ColumnPath`] for this column.
916    pub fn path(&self) -> &ColumnPath {
917        &self.path
918    }
919
920    /// Returns self type [`Type`] for this leaf column.
921    pub fn self_type(&self) -> &Type {
922        self.primitive_type.as_ref()
923    }
924
925    /// Returns self type [`TypePtr`]  for this leaf
926    /// column.
927    pub fn self_type_ptr(&self) -> TypePtr {
928        self.primitive_type.clone()
929    }
930
931    /// Returns column name.
932    pub fn name(&self) -> &str {
933        self.primitive_type.name()
934    }
935
936    /// Returns [`ConvertedType`] for this column.
937    pub fn converted_type(&self) -> ConvertedType {
938        self.primitive_type.get_basic_info().converted_type()
939    }
940
941    /// Returns [`LogicalType`] for this column.
942    ///
943    /// Note that this function will clone the `LogicalType`. If performance is a concern,
944    /// use [`Self::logical_type_ref`] instead.
945    #[deprecated(
946        since = "57.1.0",
947        note = "use `ColumnDescriptor::logical_type_ref` instead (LogicalType cloning is non trivial)"
948    )]
949    pub fn logical_type(&self) -> Option<LogicalType> {
950        self.primitive_type
951            .get_basic_info()
952            .logical_type_ref()
953            .cloned()
954    }
955
956    /// Returns a reference to the [`LogicalType`] for this column.
957    pub fn logical_type_ref(&self) -> Option<&LogicalType> {
958        self.primitive_type.get_basic_info().logical_type_ref()
959    }
960
961    /// Returns physical type for this column.
962    /// Note that it will panic if called on a non-primitive type.
963    pub fn physical_type(&self) -> PhysicalType {
964        match self.primitive_type.as_ref() {
965            Type::PrimitiveType { physical_type, .. } => *physical_type,
966            _ => panic!("Expected primitive type!"),
967        }
968    }
969
970    /// Returns type length for this column.
971    /// Note that it will panic if called on a non-primitive type.
972    pub fn type_length(&self) -> i32 {
973        match self.primitive_type.as_ref() {
974            Type::PrimitiveType { type_length, .. } => *type_length,
975            _ => panic!("Expected primitive type!"),
976        }
977    }
978
979    /// Returns type precision for this column.
980    /// Note that it will panic if called on a non-primitive type.
981    pub fn type_precision(&self) -> i32 {
982        match self.primitive_type.as_ref() {
983            Type::PrimitiveType { precision, .. } => *precision,
984            _ => panic!("Expected primitive type!"),
985        }
986    }
987
988    /// Returns type scale for this column.
989    /// Note that it will panic if called on a non-primitive type.
990    pub fn type_scale(&self) -> i32 {
991        match self.primitive_type.as_ref() {
992            Type::PrimitiveType { scale, .. } => *scale,
993            _ => panic!("Expected primitive type!"),
994        }
995    }
996
997    /// Returns the sort order for this column
998    pub fn sort_order(&self) -> SortOrder {
999        ColumnOrder::sort_order_for_type(
1000            self.logical_type_ref(),
1001            self.converted_type(),
1002            self.physical_type(),
1003        )
1004    }
1005}
1006
1007/// Schema of a Parquet file.
1008///
1009/// Encapsulates the file's schema ([`Type`]) and [`ColumnDescriptor`]s for
1010/// each primitive (leaf) column.
1011///
1012/// # Example
1013/// ```
1014/// # use std::sync::Arc;
1015/// use parquet::schema::types::{SchemaDescriptor, Type};
1016/// use parquet::basic; // note there are two `Type`s that are different
1017/// // Schema for a table with two columns: "a" (int64) and "b" (int32, stored as a date)
1018/// let descriptor = SchemaDescriptor::new(
1019///   Arc::new(
1020///     Type::group_type_builder("my_schema")
1021///       .with_fields(vec![
1022///         Arc::new(
1023///          Type::primitive_type_builder("a", basic::Type::INT64)
1024///           .build().unwrap()
1025///         ),
1026///         Arc::new(
1027///          Type::primitive_type_builder("b", basic::Type::INT32)
1028///           .with_converted_type(basic::ConvertedType::DATE)
1029///           .with_logical_type(Some(basic::LogicalType::Date))
1030///           .build().unwrap()
1031///         ),
1032///      ])
1033///      .build().unwrap()
1034///   )
1035/// );
1036/// ```
1037#[derive(PartialEq, Clone)]
1038pub struct SchemaDescriptor {
1039    /// The top-level logical schema (the "message" type).
1040    ///
1041    /// This must be a [`Type::GroupType`] where each field is a root
1042    /// column type in the schema.
1043    schema: TypePtr,
1044
1045    /// The descriptors for the physical type of each leaf column in this schema
1046    ///
1047    /// Constructed from `schema` in DFS order.
1048    leaves: Vec<ColumnDescPtr>,
1049
1050    /// Mapping from a leaf column's index to the root column index that it
1051    /// comes from.
1052    ///
1053    /// For instance: the leaf `a.b.c.d` would have a link back to `a`:
1054    /// ```text
1055    /// -- a  <-----+
1056    /// -- -- b     |
1057    /// -- -- -- c  |
1058    /// -- -- -- -- d
1059    /// ```
1060    leaf_to_base: Vec<usize>,
1061}
1062
1063impl fmt::Debug for SchemaDescriptor {
1064    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1065        // Skip leaves and leaf_to_base as they only a cache information already found in `schema`
1066        f.debug_struct("SchemaDescriptor")
1067            .field("schema", &self.schema)
1068            .finish()
1069    }
1070}
1071
1072// Need to implement HeapSize in this module as the fields are private
1073impl HeapSize for SchemaDescriptor {
1074    fn heap_size(&self) -> usize {
1075        self.schema.heap_size() + self.leaves.heap_size() + self.leaf_to_base.heap_size()
1076    }
1077}
1078
1079impl SchemaDescriptor {
1080    /// Creates new schema descriptor from Parquet schema.
1081    pub fn new(tp: TypePtr) -> Self {
1082        const INIT_SCHEMA_DEPTH: usize = 16;
1083        assert!(tp.is_group(), "SchemaDescriptor should take a GroupType");
1084        // unwrap should be safe since we just asserted tp is a group
1085        let n_leaves = num_leaves(&tp).unwrap();
1086        let mut leaves = Vec::with_capacity(n_leaves);
1087        let mut leaf_to_base = Vec::with_capacity(n_leaves);
1088        let mut path = Vec::with_capacity(INIT_SCHEMA_DEPTH);
1089        for (root_idx, f) in tp.get_fields().iter().enumerate() {
1090            path.clear();
1091            build_tree(
1092                f,
1093                root_idx,
1094                0,
1095                0,
1096                0,
1097                &mut leaves,
1098                &mut leaf_to_base,
1099                &mut path,
1100            );
1101        }
1102
1103        Self {
1104            schema: tp,
1105            leaves,
1106            leaf_to_base,
1107        }
1108    }
1109
1110    /// Returns [`ColumnDescriptor`] for a field position.
1111    pub fn column(&self, i: usize) -> ColumnDescPtr {
1112        assert!(
1113            i < self.leaves.len(),
1114            "Index out of bound: {} not in [0, {})",
1115            i,
1116            self.leaves.len()
1117        );
1118        self.leaves[i].clone()
1119    }
1120
1121    /// Returns slice of [`ColumnDescriptor`].
1122    pub fn columns(&self) -> &[ColumnDescPtr] {
1123        &self.leaves
1124    }
1125
1126    /// Returns number of leaf-level columns.
1127    pub fn num_columns(&self) -> usize {
1128        self.leaves.len()
1129    }
1130
1131    /// Returns column root [`Type`] for a leaf position.
1132    pub fn get_column_root(&self, i: usize) -> &Type {
1133        let result = self.column_root_of(i);
1134        result.as_ref()
1135    }
1136
1137    /// Returns column root [`Type`] pointer for a leaf position.
1138    pub fn get_column_root_ptr(&self, i: usize) -> TypePtr {
1139        let result = self.column_root_of(i);
1140        result.clone()
1141    }
1142
1143    /// Returns the index of the root column for a field position
1144    pub fn get_column_root_idx(&self, leaf: usize) -> usize {
1145        assert!(
1146            leaf < self.leaves.len(),
1147            "Index out of bound: {} not in [0, {})",
1148            leaf,
1149            self.leaves.len()
1150        );
1151
1152        *self
1153            .leaf_to_base
1154            .get(leaf)
1155            .unwrap_or_else(|| panic!("Expected a value for index {leaf} but found None"))
1156    }
1157
1158    fn column_root_of(&self, i: usize) -> &TypePtr {
1159        &self.schema.get_fields()[self.get_column_root_idx(i)]
1160    }
1161
1162    /// Returns schema as [`Type`].
1163    pub fn root_schema(&self) -> &Type {
1164        self.schema.as_ref()
1165    }
1166
1167    /// Returns schema as [`TypePtr`] for cheap cloning.
1168    pub fn root_schema_ptr(&self) -> TypePtr {
1169        self.schema.clone()
1170    }
1171
1172    /// Returns schema name.
1173    pub fn name(&self) -> &str {
1174        self.schema.name()
1175    }
1176}
1177
1178// walk tree and count nodes
1179pub(crate) fn num_nodes(tp: &TypePtr) -> Result<usize> {
1180    if !tp.is_group() {
1181        return Err(general_err!("Root schema must be Group type"));
1182    }
1183    let mut n_nodes = 1usize; // count root
1184    for f in tp.get_fields().iter() {
1185        count_nodes(f, &mut n_nodes);
1186    }
1187    Ok(n_nodes)
1188}
1189
1190pub(crate) fn count_nodes(tp: &TypePtr, n_nodes: &mut usize) {
1191    *n_nodes += 1;
1192    if let Type::GroupType { fields, .. } = tp.as_ref() {
1193        for f in fields {
1194            count_nodes(f, n_nodes);
1195        }
1196    }
1197}
1198
1199// do a quick walk of the tree to get proper sizing for SchemaDescriptor arrays
1200fn num_leaves(tp: &TypePtr) -> Result<usize> {
1201    if !tp.is_group() {
1202        return Err(general_err!("Root schema must be Group type"));
1203    }
1204    let mut n_leaves = 0usize;
1205    for f in tp.get_fields().iter() {
1206        count_leaves(f, &mut n_leaves);
1207    }
1208    Ok(n_leaves)
1209}
1210
1211fn count_leaves(tp: &TypePtr, n_leaves: &mut usize) {
1212    match tp.as_ref() {
1213        Type::PrimitiveType { .. } => *n_leaves += 1,
1214        Type::GroupType { fields, .. } => {
1215            for f in fields {
1216                count_leaves(f, n_leaves);
1217            }
1218        }
1219    }
1220}
1221
1222#[allow(clippy::too_many_arguments)]
1223fn build_tree<'a>(
1224    tp: &'a TypePtr,
1225    root_idx: usize,
1226    mut max_rep_level: i16,
1227    mut max_def_level: i16,
1228    mut repeated_ancestor_def_level: i16,
1229    leaves: &mut Vec<ColumnDescPtr>,
1230    leaf_to_base: &mut Vec<usize>,
1231    path_so_far: &mut Vec<&'a str>,
1232) {
1233    assert!(tp.get_basic_info().has_repetition());
1234
1235    path_so_far.push(tp.name());
1236    match tp.get_basic_info().repetition() {
1237        Repetition::OPTIONAL => {
1238            max_def_level += 1;
1239        }
1240        Repetition::REPEATED => {
1241            max_def_level += 1;
1242            max_rep_level += 1;
1243            repeated_ancestor_def_level = max_def_level;
1244        }
1245        _ => {}
1246    }
1247
1248    match tp.as_ref() {
1249        Type::PrimitiveType { .. } => {
1250            let mut path: Vec<String> = vec![];
1251            path.extend(path_so_far.iter().copied().map(String::from));
1252            let desc = ColumnDescriptor::new_with_repeated_ancestor(
1253                tp.clone(),
1254                max_def_level,
1255                max_rep_level,
1256                ColumnPath::new(path),
1257                repeated_ancestor_def_level,
1258            );
1259            leaves.push(Arc::new(desc));
1260            leaf_to_base.push(root_idx);
1261        }
1262        Type::GroupType { fields, .. } => {
1263            for f in fields {
1264                build_tree(
1265                    f,
1266                    root_idx,
1267                    max_rep_level,
1268                    max_def_level,
1269                    repeated_ancestor_def_level,
1270                    leaves,
1271                    leaf_to_base,
1272                    path_so_far,
1273                );
1274                path_so_far.pop();
1275            }
1276        }
1277    }
1278}
1279
1280/// Checks if the logical type is valid.
1281fn check_logical_type(logical_type: &Option<LogicalType>) -> Result<()> {
1282    if let Some(LogicalType::Integer(IntType { bit_width, .. })) = logical_type {
1283        if *bit_width != 8 && *bit_width != 16 && *bit_width != 32 && *bit_width != 64 {
1284            return Err(general_err!(
1285                "Bit width must be 8, 16, 32, or 64 for Integer logical type"
1286            ));
1287        }
1288    }
1289    Ok(())
1290}
1291
1292// convert thrift decoded array of `SchemaElement` into this crate's representation of
1293// parquet types. this function consumes `elements`.
1294pub(crate) fn parquet_schema_from_array<'a>(elements: Vec<SchemaElement<'a>>) -> Result<TypePtr> {
1295    let mut index = 0;
1296    let num_elements = elements.len();
1297    let mut schema_nodes = Vec::with_capacity(1); // there should only be one element when done
1298
1299    // turn into iterator so we can take ownership of elements of the vector
1300    let mut elements = elements.into_iter();
1301
1302    while index < num_elements {
1303        let t = schema_from_array_helper(&mut elements, num_elements, index)?;
1304        index = t.0;
1305        schema_nodes.push(t.1);
1306    }
1307    if schema_nodes.len() != 1 {
1308        return Err(general_err!(
1309            "Expected exactly one root node, but found {}",
1310            schema_nodes.len()
1311        ));
1312    }
1313
1314    if !schema_nodes[0].is_group() {
1315        return Err(general_err!("Expected root node to be a group type"));
1316    }
1317
1318    Ok(schema_nodes.remove(0))
1319}
1320
1321// recursive helper function for schema conversion
1322fn schema_from_array_helper<'a>(
1323    elements: &mut IntoIter<SchemaElement<'a>>,
1324    num_elements: usize,
1325    index: usize,
1326) -> Result<(usize, TypePtr)> {
1327    // Whether or not the current node is root (message type).
1328    // There is only one message type node in the schema tree.
1329    let is_root_node = index == 0;
1330
1331    if index >= num_elements {
1332        return Err(general_err!(
1333            "Index out of bound, index = {}, len = {}",
1334            index,
1335            num_elements
1336        ));
1337    }
1338    let element = elements.next().expect("schema vector should not be empty");
1339
1340    // Check for empty schema
1341    if let (true, None | Some(0)) = (is_root_node, element.num_children) {
1342        let builder = Type::group_type_builder(element.name);
1343        return Ok((index + 1, Arc::new(builder.build().unwrap())));
1344    }
1345
1346    let converted_type = element.converted_type.unwrap_or(ConvertedType::NONE);
1347
1348    // LogicalType is prefered to ConvertedType, but both may be present.
1349    let logical_type = element.logical_type;
1350
1351    check_logical_type(&logical_type)?;
1352
1353    let field_id = element.field_id;
1354    match element.num_children {
1355        // From parquet-format:
1356        //   The children count is used to construct the nested relationship.
1357        //   This field is not set when the element is a primitive type
1358        // Sometimes parquet-cpp sets num_children field to 0 for primitive types, so we
1359        // have to handle this case too.
1360        None | Some(0) => {
1361            // primitive type
1362            if element.repetition_type.is_none() {
1363                return Err(general_err!(
1364                    "Repetition level must be defined for a primitive type"
1365                ));
1366            }
1367            let repetition = element.repetition_type.unwrap();
1368            if let Some(physical_type) = element.r#type {
1369                let length = element.type_length.unwrap_or(-1);
1370                let scale = element.scale.unwrap_or(-1);
1371                let precision = element.precision.unwrap_or(-1);
1372                let name = element.name;
1373                let builder = Type::primitive_type_builder(name, physical_type)
1374                    .with_repetition(repetition)
1375                    .with_converted_type(converted_type)
1376                    .with_logical_type(logical_type)
1377                    .with_length(length)
1378                    .with_precision(precision)
1379                    .with_scale(scale)
1380                    .with_id(field_id);
1381                Ok((index + 1, Arc::new(builder.build()?)))
1382            } else {
1383                let mut builder = Type::group_type_builder(element.name)
1384                    .with_converted_type(converted_type)
1385                    .with_logical_type(logical_type)
1386                    .with_id(field_id);
1387                if !is_root_node {
1388                    // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or
1389                    // REPEATED for root node.
1390                    //
1391                    // We only set repetition for group types that are not top-level message
1392                    // type. According to parquet-format:
1393                    //   Root of the schema does not have a repetition_type.
1394                    //   All other types must have one.
1395                    builder = builder.with_repetition(repetition);
1396                }
1397                Ok((index + 1, Arc::new(builder.build().unwrap())))
1398            }
1399        }
1400        Some(n) => {
1401            let repetition = element.repetition_type;
1402
1403            let mut fields = Vec::with_capacity(usize::try_from(n)?);
1404            let mut next_index = index + 1;
1405            for _ in 0..n {
1406                let child_result = schema_from_array_helper(elements, num_elements, next_index)?;
1407                next_index = child_result.0;
1408                fields.push(child_result.1);
1409            }
1410
1411            let mut builder = Type::group_type_builder(element.name)
1412                .with_converted_type(converted_type)
1413                .with_logical_type(logical_type)
1414                .with_fields(fields)
1415                .with_id(field_id);
1416
1417            // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or
1418            // REPEATED for root node.
1419            //
1420            // We only set repetition for group types that are not top-level message
1421            // type. According to parquet-format:
1422            //   Root of the schema does not have a repetition_type.
1423            //   All other types must have one.
1424            if !is_root_node {
1425                let Some(rep) = repetition else {
1426                    return Err(general_err!(
1427                        "Repetition level must be defined for non-root types"
1428                    ));
1429                };
1430                builder = builder.with_repetition(rep);
1431            }
1432            Ok((next_index, Arc::new(builder.build()?)))
1433        }
1434    }
1435}
1436
1437#[cfg(test)]
1438mod tests {
1439    use super::*;
1440
1441    use crate::{
1442        file::metadata::thrift::tests::{buf_to_schema_list, roundtrip_schema, schema_to_buf},
1443        schema::parser::parse_message_type,
1444    };
1445
1446    // TODO: add tests for v2 types
1447
1448    #[test]
1449    fn test_primitive_type() {
1450        let mut result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1451            .with_logical_type(Some(LogicalType::integer(32, true)))
1452            .with_id(Some(0))
1453            .build();
1454        assert!(result.is_ok());
1455
1456        if let Ok(tp) = result {
1457            assert!(tp.is_primitive());
1458            assert!(!tp.is_group());
1459            let basic_info = tp.get_basic_info();
1460            assert_eq!(basic_info.repetition(), Repetition::OPTIONAL);
1461            assert_eq!(
1462                basic_info.logical_type_ref(),
1463                Some(&LogicalType::integer(32, true))
1464            );
1465            assert_eq!(basic_info.converted_type(), ConvertedType::INT_32);
1466            assert_eq!(basic_info.id(), 0);
1467            match tp {
1468                Type::PrimitiveType { physical_type, .. } => {
1469                    assert_eq!(physical_type, PhysicalType::INT32);
1470                }
1471                _ => panic!(),
1472            }
1473        }
1474
1475        // Test illegal inputs with logical type
1476        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1477            .with_repetition(Repetition::REPEATED)
1478            .with_logical_type(Some(LogicalType::integer(8, true)))
1479            .build();
1480        assert!(result.is_err());
1481        if let Err(e) = result {
1482            assert_eq!(
1483                format!("{e}"),
1484                "Parquet error: Cannot annotate Integer(IntType { bit_width: 8, is_signed: true }) from INT64 for field 'foo'"
1485            );
1486        }
1487
1488        // Test illegal inputs with converted type
1489        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1490            .with_repetition(Repetition::REPEATED)
1491            .with_converted_type(ConvertedType::BSON)
1492            .build();
1493        assert!(result.is_err());
1494        if let Err(e) = result {
1495            assert_eq!(
1496                format!("{e}"),
1497                "Parquet error: BSON cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1498            );
1499        }
1500
1501        result = Type::primitive_type_builder("foo", PhysicalType::INT96)
1502            .with_repetition(Repetition::REQUIRED)
1503            .with_converted_type(ConvertedType::DECIMAL)
1504            .with_precision(-1)
1505            .with_scale(-1)
1506            .build();
1507        assert!(result.is_err());
1508        if let Err(e) = result {
1509            assert_eq!(
1510                format!("{e}"),
1511                "Parquet error: DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
1512            );
1513        }
1514
1515        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1516            .with_repetition(Repetition::REQUIRED)
1517            .with_logical_type(Some(LogicalType::decimal(32, 12)))
1518            .with_precision(-1)
1519            .with_scale(-1)
1520            .build();
1521        assert!(result.is_err());
1522        if let Err(e) = result {
1523            assert_eq!(
1524                format!("{e}"),
1525                "Parquet error: DECIMAL logical type scale 32 must match self.scale -1 for field 'foo'"
1526            );
1527        }
1528
1529        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1530            .with_repetition(Repetition::REQUIRED)
1531            .with_converted_type(ConvertedType::DECIMAL)
1532            .with_precision(-1)
1533            .with_scale(-1)
1534            .build();
1535        assert!(result.is_err());
1536        if let Err(e) = result {
1537            assert_eq!(
1538                format!("{e}"),
1539                "Parquet error: Invalid DECIMAL precision: -1"
1540            );
1541        }
1542
1543        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1544            .with_repetition(Repetition::REQUIRED)
1545            .with_converted_type(ConvertedType::DECIMAL)
1546            .with_precision(0)
1547            .with_scale(-1)
1548            .build();
1549        assert!(result.is_err());
1550        if let Err(e) = result {
1551            assert_eq!(
1552                format!("{e}"),
1553                "Parquet error: Invalid DECIMAL precision: 0"
1554            );
1555        }
1556
1557        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1558            .with_repetition(Repetition::REQUIRED)
1559            .with_converted_type(ConvertedType::DECIMAL)
1560            .with_precision(1)
1561            .with_scale(-1)
1562            .build();
1563        assert!(result.is_err());
1564        if let Err(e) = result {
1565            assert_eq!(format!("{e}"), "Parquet error: Invalid DECIMAL scale: -1");
1566        }
1567
1568        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1569            .with_repetition(Repetition::REQUIRED)
1570            .with_converted_type(ConvertedType::DECIMAL)
1571            .with_precision(1)
1572            .with_scale(2)
1573            .build();
1574        assert!(result.is_err());
1575        if let Err(e) = result {
1576            assert_eq!(
1577                format!("{e}"),
1578                "Parquet error: Invalid DECIMAL: scale (2) cannot be greater than precision (1)"
1579            );
1580        }
1581
1582        // It is OK if precision == scale
1583        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1584            .with_repetition(Repetition::REQUIRED)
1585            .with_converted_type(ConvertedType::DECIMAL)
1586            .with_precision(1)
1587            .with_scale(1)
1588            .build();
1589        assert!(result.is_ok());
1590
1591        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1592            .with_repetition(Repetition::REQUIRED)
1593            .with_converted_type(ConvertedType::DECIMAL)
1594            .with_precision(18)
1595            .with_scale(2)
1596            .build();
1597        assert!(result.is_err());
1598        if let Err(e) = result {
1599            assert_eq!(
1600                format!("{e}"),
1601                "Parquet error: Cannot represent INT32 as DECIMAL with precision 18"
1602            );
1603        }
1604
1605        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1606            .with_repetition(Repetition::REQUIRED)
1607            .with_converted_type(ConvertedType::DECIMAL)
1608            .with_precision(32)
1609            .with_scale(2)
1610            .build();
1611        assert!(result.is_err());
1612        if let Err(e) = result {
1613            assert_eq!(
1614                format!("{e}"),
1615                "Parquet error: Cannot represent INT64 as DECIMAL with precision 32"
1616            );
1617        }
1618
1619        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1620            .with_repetition(Repetition::REQUIRED)
1621            .with_converted_type(ConvertedType::DECIMAL)
1622            .with_length(5)
1623            .with_precision(12)
1624            .with_scale(2)
1625            .build();
1626        assert!(result.is_err());
1627        if let Err(e) = result {
1628            assert_eq!(
1629                format!("{e}"),
1630                "Parquet error: Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length 5 and precision 12. The max precision can only be 11"
1631            );
1632        }
1633
1634        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1635            .with_repetition(Repetition::REQUIRED)
1636            .with_converted_type(ConvertedType::UINT_8)
1637            .build();
1638        assert!(result.is_err());
1639        if let Err(e) = result {
1640            assert_eq!(
1641                format!("{e}"),
1642                "Parquet error: UINT_8 cannot annotate field 'foo' because it is not a INT32 field"
1643            );
1644        }
1645
1646        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1647            .with_repetition(Repetition::REQUIRED)
1648            .with_converted_type(ConvertedType::TIME_MICROS)
1649            .build();
1650        assert!(result.is_err());
1651        if let Err(e) = result {
1652            assert_eq!(
1653                format!("{e}"),
1654                "Parquet error: TIME_MICROS cannot annotate field 'foo' because it is not a INT64 field"
1655            );
1656        }
1657
1658        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1659            .with_repetition(Repetition::REQUIRED)
1660            .with_converted_type(ConvertedType::INTERVAL)
1661            .build();
1662        assert!(result.is_err());
1663        if let Err(e) = result {
1664            assert_eq!(
1665                format!("{e}"),
1666                "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1667            );
1668        }
1669
1670        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1671            .with_repetition(Repetition::REQUIRED)
1672            .with_converted_type(ConvertedType::INTERVAL)
1673            .with_length(1)
1674            .build();
1675        assert!(result.is_err());
1676        if let Err(e) = result {
1677            assert_eq!(
1678                format!("{e}"),
1679                "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1680            );
1681        }
1682
1683        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1684            .with_repetition(Repetition::REQUIRED)
1685            .with_converted_type(ConvertedType::ENUM)
1686            .build();
1687        assert!(result.is_err());
1688        if let Err(e) = result {
1689            assert_eq!(
1690                format!("{e}"),
1691                "Parquet error: ENUM cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1692            );
1693        }
1694
1695        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1696            .with_repetition(Repetition::REQUIRED)
1697            .with_converted_type(ConvertedType::MAP)
1698            .build();
1699        assert!(result.is_err());
1700        if let Err(e) = result {
1701            assert_eq!(
1702                format!("{e}"),
1703                "Parquet error: MAP cannot be applied to primitive field 'foo'"
1704            );
1705        }
1706
1707        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1708            .with_repetition(Repetition::REQUIRED)
1709            .with_converted_type(ConvertedType::DECIMAL)
1710            .with_length(-1)
1711            .build();
1712        assert!(result.is_err());
1713        if let Err(e) = result {
1714            assert_eq!(
1715                format!("{e}"),
1716                "Parquet error: Invalid FIXED_LEN_BYTE_ARRAY length: -1 for field 'foo'"
1717            );
1718        }
1719
1720        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1721            .with_repetition(Repetition::REQUIRED)
1722            .with_logical_type(Some(LogicalType::Float16))
1723            .with_length(2)
1724            .build();
1725        assert!(result.is_ok());
1726
1727        // Can't be other than FIXED_LEN_BYTE_ARRAY for physical type
1728        result = Type::primitive_type_builder("foo", PhysicalType::FLOAT)
1729            .with_repetition(Repetition::REQUIRED)
1730            .with_logical_type(Some(LogicalType::Float16))
1731            .with_length(2)
1732            .build();
1733        assert!(result.is_err());
1734        if let Err(e) = result {
1735            assert_eq!(
1736                format!("{e}"),
1737                "Parquet error: Cannot annotate Float16 from FLOAT for field 'foo'"
1738            );
1739        }
1740
1741        // Must have length 2
1742        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1743            .with_repetition(Repetition::REQUIRED)
1744            .with_logical_type(Some(LogicalType::Float16))
1745            .with_length(4)
1746            .build();
1747        assert!(result.is_err());
1748        if let Err(e) = result {
1749            assert_eq!(
1750                format!("{e}"),
1751                "Parquet error: FLOAT16 cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(2) field"
1752            );
1753        }
1754
1755        // Must have length 16
1756        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1757            .with_repetition(Repetition::REQUIRED)
1758            .with_logical_type(Some(LogicalType::Uuid))
1759            .with_length(15)
1760            .build();
1761        assert!(result.is_err());
1762        if let Err(e) = result {
1763            assert_eq!(
1764                format!("{e}"),
1765                "Parquet error: UUID cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(16) field"
1766            );
1767        }
1768
1769        // test unknown logical types are ok
1770        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1771            .with_logical_type(Some(LogicalType::_Unknown { field_id: 100 }))
1772            .build();
1773        assert!(result.is_ok());
1774    }
1775
1776    #[test]
1777    fn test_group_type() {
1778        let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32)
1779            .with_converted_type(ConvertedType::INT_32)
1780            .with_id(Some(0))
1781            .build();
1782        assert!(f1.is_ok());
1783        let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY)
1784            .with_converted_type(ConvertedType::UTF8)
1785            .with_id(Some(1))
1786            .build();
1787        assert!(f2.is_ok());
1788
1789        let fields = vec![Arc::new(f1.unwrap()), Arc::new(f2.unwrap())];
1790
1791        let result = Type::group_type_builder("foo")
1792            .with_repetition(Repetition::REPEATED)
1793            .with_logical_type(Some(LogicalType::List))
1794            .with_fields(fields)
1795            .with_id(Some(1))
1796            .build();
1797        assert!(result.is_ok());
1798
1799        let tp = result.unwrap();
1800        let basic_info = tp.get_basic_info();
1801        assert!(tp.is_group());
1802        assert!(!tp.is_primitive());
1803        assert_eq!(basic_info.repetition(), Repetition::REPEATED);
1804        assert_eq!(basic_info.logical_type_ref(), Some(&LogicalType::List));
1805        assert_eq!(basic_info.converted_type(), ConvertedType::LIST);
1806        assert_eq!(basic_info.id(), 1);
1807        assert_eq!(tp.get_fields().len(), 2);
1808        assert_eq!(tp.get_fields()[0].name(), "f1");
1809        assert_eq!(tp.get_fields()[1].name(), "f2");
1810    }
1811
1812    #[test]
1813    fn test_column_descriptor() {
1814        let result = test_column_descriptor_helper();
1815        assert!(
1816            result.is_ok(),
1817            "Expected result to be OK but got err:\n {}",
1818            result.unwrap_err()
1819        );
1820    }
1821
1822    fn test_column_descriptor_helper() -> Result<()> {
1823        let tp = Type::primitive_type_builder("name", PhysicalType::BYTE_ARRAY)
1824            .with_converted_type(ConvertedType::UTF8)
1825            .build()?;
1826
1827        let descr = ColumnDescriptor::new(Arc::new(tp), 4, 1, ColumnPath::from("name"));
1828
1829        assert_eq!(descr.path(), &ColumnPath::from("name"));
1830        assert_eq!(descr.converted_type(), ConvertedType::UTF8);
1831        assert_eq!(descr.physical_type(), PhysicalType::BYTE_ARRAY);
1832        assert_eq!(descr.max_def_level(), 4);
1833        assert_eq!(descr.max_rep_level(), 1);
1834        assert_eq!(descr.name(), "name");
1835        assert_eq!(descr.type_length(), -1);
1836        assert_eq!(descr.type_precision(), -1);
1837        assert_eq!(descr.type_scale(), -1);
1838
1839        Ok(())
1840    }
1841
1842    #[test]
1843    fn test_schema_descriptor() {
1844        let result = test_schema_descriptor_helper();
1845        assert!(
1846            result.is_ok(),
1847            "Expected result to be OK but got err:\n {}",
1848            result.unwrap_err()
1849        );
1850    }
1851
1852    // A helper fn to avoid handling the results from type creation
1853    fn test_schema_descriptor_helper() -> Result<()> {
1854        let mut fields = vec![];
1855
1856        let inta = Type::primitive_type_builder("a", PhysicalType::INT32)
1857            .with_repetition(Repetition::REQUIRED)
1858            .with_converted_type(ConvertedType::INT_32)
1859            .build()?;
1860        fields.push(Arc::new(inta));
1861        let intb = Type::primitive_type_builder("b", PhysicalType::INT64)
1862            .with_converted_type(ConvertedType::INT_64)
1863            .build()?;
1864        fields.push(Arc::new(intb));
1865        let intc = Type::primitive_type_builder("c", PhysicalType::BYTE_ARRAY)
1866            .with_repetition(Repetition::REPEATED)
1867            .with_converted_type(ConvertedType::UTF8)
1868            .build()?;
1869        fields.push(Arc::new(intc));
1870
1871        // 3-level list encoding
1872        let item1 = Type::primitive_type_builder("item1", PhysicalType::INT64)
1873            .with_repetition(Repetition::REQUIRED)
1874            .with_converted_type(ConvertedType::INT_64)
1875            .build()?;
1876        let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?;
1877        let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32)
1878            .with_repetition(Repetition::REPEATED)
1879            .with_converted_type(ConvertedType::INT_32)
1880            .build()?;
1881        let list = Type::group_type_builder("records")
1882            .with_repetition(Repetition::REPEATED)
1883            .with_converted_type(ConvertedType::LIST)
1884            .with_fields(vec![Arc::new(item1), Arc::new(item2), Arc::new(item3)])
1885            .build()?;
1886        let bag = Type::group_type_builder("bag")
1887            .with_repetition(Repetition::OPTIONAL)
1888            .with_fields(vec![Arc::new(list)])
1889            .build()?;
1890        fields.push(Arc::new(bag));
1891
1892        let schema = Type::group_type_builder("schema")
1893            .with_repetition(Repetition::REPEATED)
1894            .with_fields(fields)
1895            .build()?;
1896        let descr = SchemaDescriptor::new(Arc::new(schema));
1897
1898        let nleaves = 6;
1899        assert_eq!(descr.num_columns(), nleaves);
1900
1901        //                             mdef mrep
1902        // required int32 a            0    0
1903        // optional int64 b            1    0
1904        // repeated byte_array c       1    1
1905        // optional group bag          1    0
1906        //   repeated group records    2    1
1907        //     required int64 item1    2    1
1908        //     optional boolean item2  3    1
1909        //     repeated int32 item3    3    2
1910        let ex_max_def_levels = [0, 1, 1, 2, 3, 3];
1911        let ex_max_rep_levels = [0, 0, 1, 1, 1, 2];
1912
1913        for i in 0..nleaves {
1914            let col = descr.column(i);
1915            assert_eq!(col.max_def_level(), ex_max_def_levels[i], "{i}");
1916            assert_eq!(col.max_rep_level(), ex_max_rep_levels[i], "{i}");
1917        }
1918
1919        assert_eq!(descr.column(0).path().string(), "a");
1920        assert_eq!(descr.column(1).path().string(), "b");
1921        assert_eq!(descr.column(2).path().string(), "c");
1922        assert_eq!(descr.column(3).path().string(), "bag.records.item1");
1923        assert_eq!(descr.column(4).path().string(), "bag.records.item2");
1924        assert_eq!(descr.column(5).path().string(), "bag.records.item3");
1925
1926        assert_eq!(descr.get_column_root(0).name(), "a");
1927        assert_eq!(descr.get_column_root(3).name(), "bag");
1928        assert_eq!(descr.get_column_root(4).name(), "bag");
1929        assert_eq!(descr.get_column_root(5).name(), "bag");
1930
1931        Ok(())
1932    }
1933
1934    #[test]
1935    fn test_schema_build_tree_def_rep_levels() {
1936        let message_type = "
1937    message spark_schema {
1938      REQUIRED INT32 a;
1939      OPTIONAL group b {
1940        OPTIONAL INT32 _1;
1941        OPTIONAL INT32 _2;
1942      }
1943      OPTIONAL group c (LIST) {
1944        REPEATED group list {
1945          OPTIONAL INT32 element;
1946        }
1947      }
1948    }
1949    ";
1950        let schema = parse_message_type(message_type).expect("should parse schema");
1951        let descr = SchemaDescriptor::new(Arc::new(schema));
1952        // required int32 a
1953        assert_eq!(descr.column(0).max_def_level(), 0);
1954        assert_eq!(descr.column(0).max_rep_level(), 0);
1955        // optional int32 b._1
1956        assert_eq!(descr.column(1).max_def_level(), 2);
1957        assert_eq!(descr.column(1).max_rep_level(), 0);
1958        // optional int32 b._2
1959        assert_eq!(descr.column(2).max_def_level(), 2);
1960        assert_eq!(descr.column(2).max_rep_level(), 0);
1961        // repeated optional int32 c.list.element
1962        assert_eq!(descr.column(3).max_def_level(), 3);
1963        assert_eq!(descr.column(3).max_rep_level(), 1);
1964    }
1965
1966    #[test]
1967    fn test_schema_build_tree_repeated_ancestor_def_level() {
1968        // Flat columns: no REPEATED ancestor → repeated_ancestor_def_level = 0
1969        let message_type = "
1970    message m {
1971      REQUIRED INT32 a;
1972      OPTIONAL INT32 b;
1973      OPTIONAL group s {
1974        OPTIONAL INT32 x;
1975      }
1976    }
1977    ";
1978        let schema = parse_message_type(message_type).expect("should parse schema");
1979        let descr = SchemaDescriptor::new(Arc::new(schema));
1980        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 0); // a
1981        assert_eq!(descr.column(1).repeated_ancestor_def_level(), 0); // b
1982        assert_eq!(descr.column(2).repeated_ancestor_def_level(), 0); // s.x
1983
1984        // Standard list: OPTIONAL outer, REPEATED group, OPTIONAL element
1985        // repeated_ancestor_def_level is the def_level at the REPEATED group (= 2)
1986        let message_type = "
1987    message m {
1988      OPTIONAL group c (LIST) {
1989        REPEATED group list {
1990          OPTIONAL INT32 element;
1991        }
1992      }
1993    }
1994    ";
1995        let schema = parse_message_type(message_type).expect("should parse schema");
1996        let descr = SchemaDescriptor::new(Arc::new(schema));
1997        // c(optional)=1, list(repeated)=2, element(optional)=3
1998        assert_eq!(descr.column(0).max_def_level(), 3);
1999        assert_eq!(descr.column(0).max_rep_level(), 1);
2000        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 2);
2001
2002        // Required list: REQUIRED outer, REPEATED group, REQUIRED element
2003        // No OPTIONAL nodes between REPEATED and leaf, so repeated_ancestor_def_level == max_def_level
2004        let message_type = "
2005    message m {
2006      REQUIRED group c (LIST) {
2007        REPEATED group list {
2008          REQUIRED INT32 element;
2009        }
2010      }
2011    }
2012    ";
2013        let schema = parse_message_type(message_type).expect("should parse schema");
2014        let descr = SchemaDescriptor::new(Arc::new(schema));
2015        // list(repeated)=1, element(required)=1
2016        assert_eq!(descr.column(0).max_def_level(), 1);
2017        assert_eq!(descr.column(0).max_rep_level(), 1);
2018        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 1);
2019
2020        // Nested lists: innermost REPEATED wins
2021        let message_type = "
2022    message m {
2023      OPTIONAL group outer (LIST) {
2024        REPEATED group list {
2025          OPTIONAL group inner (LIST) {
2026            REPEATED group list2 {
2027              OPTIONAL INT32 element;
2028            }
2029          }
2030        }
2031      }
2032    }
2033    ";
2034        let schema = parse_message_type(message_type).expect("should parse schema");
2035        let descr = SchemaDescriptor::new(Arc::new(schema));
2036        // outer(opt)=1, list(rep)=2, inner(opt)=3, list2(rep)=4, element(opt)=5
2037        assert_eq!(descr.column(0).max_def_level(), 5);
2038        assert_eq!(descr.column(0).max_rep_level(), 2);
2039        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 4);
2040
2041        // Struct inside list: all sibling leaves share the same repeated_ancestor_def_level
2042        let message_type = "
2043    message m {
2044      OPTIONAL group bag (LIST) {
2045        REPEATED group list {
2046          REQUIRED group item {
2047            OPTIONAL INT32 x;
2048            REQUIRED INT32 y;
2049          }
2050        }
2051      }
2052    }
2053    ";
2054        let schema = parse_message_type(message_type).expect("should parse schema");
2055        let descr = SchemaDescriptor::new(Arc::new(schema));
2056        // bag(opt)=1, list(rep)=2, item(req)=2, x(opt)=3
2057        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 2); // bag.list.item.x
2058        // bag(opt)=1, list(rep)=2, item(req)=2, y(req)=2
2059        assert_eq!(descr.column(1).repeated_ancestor_def_level(), 2); // bag.list.item.y
2060
2061        // Map type: key (required) and value (optional) under the same REPEATED group
2062        let message_type = "
2063    message m {
2064      OPTIONAL group my_map (MAP) {
2065        REPEATED group key_value {
2066          REQUIRED BYTE_ARRAY key (UTF8);
2067          OPTIONAL INT32 value;
2068        }
2069      }
2070    }
2071    ";
2072        let schema = parse_message_type(message_type).expect("should parse schema");
2073        let descr = SchemaDescriptor::new(Arc::new(schema));
2074        // my_map(opt)=1, key_value(rep)=2, key(req)=2
2075        assert_eq!(descr.column(0).max_def_level(), 2);
2076        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 2); // key: max_def == repeated_ancestor
2077        // my_map(opt)=1, key_value(rep)=2, value(opt)=3
2078        assert_eq!(descr.column(1).max_def_level(), 3);
2079        assert_eq!(descr.column(1).repeated_ancestor_def_level(), 2); // value: max_def > repeated_ancestor
2080    }
2081
2082    #[test]
2083    #[should_panic(expected = "Cannot call get_physical_type() on a non-primitive type")]
2084    fn test_get_physical_type_panic() {
2085        let list = Type::group_type_builder("records")
2086            .with_repetition(Repetition::REPEATED)
2087            .build()
2088            .unwrap();
2089        list.get_physical_type();
2090    }
2091
2092    #[test]
2093    fn test_get_physical_type_primitive() {
2094        let f = Type::primitive_type_builder("f", PhysicalType::INT64)
2095            .build()
2096            .unwrap();
2097        assert_eq!(f.get_physical_type(), PhysicalType::INT64);
2098
2099        let f = Type::primitive_type_builder("f", PhysicalType::BYTE_ARRAY)
2100            .build()
2101            .unwrap();
2102        assert_eq!(f.get_physical_type(), PhysicalType::BYTE_ARRAY);
2103    }
2104
2105    #[test]
2106    fn test_check_contains_primitive_primitive() {
2107        // OK
2108        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2109            .build()
2110            .unwrap();
2111        let f2 = Type::primitive_type_builder("f", PhysicalType::INT32)
2112            .build()
2113            .unwrap();
2114        assert!(f1.check_contains(&f2));
2115
2116        // OK: different logical type does not affect check_contains
2117        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2118            .with_converted_type(ConvertedType::UINT_8)
2119            .build()
2120            .unwrap();
2121        let f2 = Type::primitive_type_builder("f", PhysicalType::INT32)
2122            .with_converted_type(ConvertedType::UINT_16)
2123            .build()
2124            .unwrap();
2125        assert!(f1.check_contains(&f2));
2126
2127        // KO: different name
2128        let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32)
2129            .build()
2130            .unwrap();
2131        let f2 = Type::primitive_type_builder("f2", PhysicalType::INT32)
2132            .build()
2133            .unwrap();
2134        assert!(!f1.check_contains(&f2));
2135
2136        // KO: different type
2137        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2138            .build()
2139            .unwrap();
2140        let f2 = Type::primitive_type_builder("f", PhysicalType::INT64)
2141            .build()
2142            .unwrap();
2143        assert!(!f1.check_contains(&f2));
2144
2145        // KO: different repetition
2146        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2147            .with_repetition(Repetition::REQUIRED)
2148            .build()
2149            .unwrap();
2150        let f2 = Type::primitive_type_builder("f", PhysicalType::INT32)
2151            .with_repetition(Repetition::OPTIONAL)
2152            .build()
2153            .unwrap();
2154        assert!(!f1.check_contains(&f2));
2155    }
2156
2157    // function to create a new group type for testing
2158    fn test_new_group_type(name: &str, repetition: Repetition, types: Vec<Type>) -> Type {
2159        Type::group_type_builder(name)
2160            .with_repetition(repetition)
2161            .with_fields(types.into_iter().map(Arc::new).collect())
2162            .build()
2163            .unwrap()
2164    }
2165
2166    #[test]
2167    fn test_check_contains_group_group() {
2168        // OK: should match okay with empty fields
2169        let f1 = Type::group_type_builder("f").build().unwrap();
2170        let f2 = Type::group_type_builder("f").build().unwrap();
2171        assert!(f1.check_contains(&f2));
2172        assert!(!f1.is_optional());
2173
2174        // OK: fields match
2175        let f1 = test_new_group_type(
2176            "f",
2177            Repetition::REPEATED,
2178            vec![
2179                Type::primitive_type_builder("f1", PhysicalType::INT32)
2180                    .build()
2181                    .unwrap(),
2182                Type::primitive_type_builder("f2", PhysicalType::INT64)
2183                    .build()
2184                    .unwrap(),
2185            ],
2186        );
2187        let f2 = test_new_group_type(
2188            "f",
2189            Repetition::REPEATED,
2190            vec![
2191                Type::primitive_type_builder("f1", PhysicalType::INT32)
2192                    .build()
2193                    .unwrap(),
2194                Type::primitive_type_builder("f2", PhysicalType::INT64)
2195                    .build()
2196                    .unwrap(),
2197            ],
2198        );
2199        assert!(f1.check_contains(&f2));
2200
2201        // OK: subset of fields
2202        let f1 = test_new_group_type(
2203            "f",
2204            Repetition::REPEATED,
2205            vec![
2206                Type::primitive_type_builder("f1", PhysicalType::INT32)
2207                    .build()
2208                    .unwrap(),
2209                Type::primitive_type_builder("f2", PhysicalType::INT64)
2210                    .build()
2211                    .unwrap(),
2212            ],
2213        );
2214        let f2 = test_new_group_type(
2215            "f",
2216            Repetition::REPEATED,
2217            vec![
2218                Type::primitive_type_builder("f2", PhysicalType::INT64)
2219                    .build()
2220                    .unwrap(),
2221            ],
2222        );
2223        assert!(f1.check_contains(&f2));
2224
2225        // KO: different name
2226        let f1 = Type::group_type_builder("f1").build().unwrap();
2227        let f2 = Type::group_type_builder("f2").build().unwrap();
2228        assert!(!f1.check_contains(&f2));
2229
2230        // KO: different repetition
2231        let f1 = Type::group_type_builder("f")
2232            .with_repetition(Repetition::OPTIONAL)
2233            .build()
2234            .unwrap();
2235        let f2 = Type::group_type_builder("f")
2236            .with_repetition(Repetition::REPEATED)
2237            .build()
2238            .unwrap();
2239        assert!(!f1.check_contains(&f2));
2240
2241        // KO: different fields
2242        let f1 = test_new_group_type(
2243            "f",
2244            Repetition::REPEATED,
2245            vec![
2246                Type::primitive_type_builder("f1", PhysicalType::INT32)
2247                    .build()
2248                    .unwrap(),
2249                Type::primitive_type_builder("f2", PhysicalType::INT64)
2250                    .build()
2251                    .unwrap(),
2252            ],
2253        );
2254        let f2 = test_new_group_type(
2255            "f",
2256            Repetition::REPEATED,
2257            vec![
2258                Type::primitive_type_builder("f1", PhysicalType::INT32)
2259                    .build()
2260                    .unwrap(),
2261                Type::primitive_type_builder("f2", PhysicalType::BOOLEAN)
2262                    .build()
2263                    .unwrap(),
2264            ],
2265        );
2266        assert!(!f1.check_contains(&f2));
2267
2268        // KO: different fields
2269        let f1 = test_new_group_type(
2270            "f",
2271            Repetition::REPEATED,
2272            vec![
2273                Type::primitive_type_builder("f1", PhysicalType::INT32)
2274                    .build()
2275                    .unwrap(),
2276                Type::primitive_type_builder("f2", PhysicalType::INT64)
2277                    .build()
2278                    .unwrap(),
2279            ],
2280        );
2281        let f2 = test_new_group_type(
2282            "f",
2283            Repetition::REPEATED,
2284            vec![
2285                Type::primitive_type_builder("f3", PhysicalType::INT32)
2286                    .build()
2287                    .unwrap(),
2288            ],
2289        );
2290        assert!(!f1.check_contains(&f2));
2291    }
2292
2293    #[test]
2294    fn test_check_contains_group_primitive() {
2295        // KO: should not match
2296        let f1 = Type::group_type_builder("f").build().unwrap();
2297        let f2 = Type::primitive_type_builder("f", PhysicalType::INT64)
2298            .build()
2299            .unwrap();
2300        assert!(!f1.check_contains(&f2));
2301        assert!(!f2.check_contains(&f1));
2302
2303        // KO: should not match when primitive field is part of group type
2304        let f1 = test_new_group_type(
2305            "f",
2306            Repetition::REPEATED,
2307            vec![
2308                Type::primitive_type_builder("f1", PhysicalType::INT32)
2309                    .build()
2310                    .unwrap(),
2311            ],
2312        );
2313        let f2 = Type::primitive_type_builder("f1", PhysicalType::INT32)
2314            .build()
2315            .unwrap();
2316        assert!(!f1.check_contains(&f2));
2317        assert!(!f2.check_contains(&f1));
2318
2319        // OK: match nested types
2320        let f1 = test_new_group_type(
2321            "a",
2322            Repetition::REPEATED,
2323            vec![
2324                test_new_group_type(
2325                    "b",
2326                    Repetition::REPEATED,
2327                    vec![
2328                        Type::primitive_type_builder("c", PhysicalType::INT32)
2329                            .build()
2330                            .unwrap(),
2331                    ],
2332                ),
2333                Type::primitive_type_builder("d", PhysicalType::INT64)
2334                    .build()
2335                    .unwrap(),
2336                Type::primitive_type_builder("e", PhysicalType::BOOLEAN)
2337                    .build()
2338                    .unwrap(),
2339            ],
2340        );
2341        let f2 = test_new_group_type(
2342            "a",
2343            Repetition::REPEATED,
2344            vec![test_new_group_type(
2345                "b",
2346                Repetition::REPEATED,
2347                vec![
2348                    Type::primitive_type_builder("c", PhysicalType::INT32)
2349                        .build()
2350                        .unwrap(),
2351                ],
2352            )],
2353        );
2354        assert!(f1.check_contains(&f2)); // should match
2355        assert!(!f2.check_contains(&f1)); // should fail
2356    }
2357
2358    #[test]
2359    fn test_schema_type_thrift_conversion_err() {
2360        let schema = Type::primitive_type_builder("col", PhysicalType::INT32)
2361            .build()
2362            .unwrap();
2363        let schema = Arc::new(schema);
2364        let thrift_schema = schema_to_buf(&schema);
2365        assert!(thrift_schema.is_err());
2366        if let Err(e) = thrift_schema {
2367            assert_eq!(
2368                format!("{e}"),
2369                "Parquet error: Root schema must be Group type"
2370            );
2371        }
2372    }
2373
2374    #[test]
2375    fn test_schema_type_thrift_conversion() {
2376        let message_type = "
2377    message conversions {
2378      REQUIRED INT64 id;
2379      OPTIONAL FIXED_LEN_BYTE_ARRAY (2) f16 (FLOAT16);
2380      OPTIONAL group int_array_Array (LIST) {
2381        REPEATED group list {
2382          OPTIONAL group element (LIST) {
2383            REPEATED group list {
2384              OPTIONAL INT32 element;
2385            }
2386          }
2387        }
2388      }
2389      OPTIONAL group int_map (MAP) {
2390        REPEATED group map (MAP_KEY_VALUE) {
2391          REQUIRED BYTE_ARRAY key (UTF8);
2392          OPTIONAL INT32 value;
2393        }
2394      }
2395      OPTIONAL group int_Map_Array (LIST) {
2396        REPEATED group list {
2397          OPTIONAL group g (MAP) {
2398            REPEATED group map (MAP_KEY_VALUE) {
2399              REQUIRED BYTE_ARRAY key (UTF8);
2400              OPTIONAL group value {
2401                OPTIONAL group H {
2402                  OPTIONAL group i (LIST) {
2403                    REPEATED group list {
2404                      OPTIONAL DOUBLE element;
2405                    }
2406                  }
2407                }
2408              }
2409            }
2410          }
2411        }
2412      }
2413      OPTIONAL group nested_struct {
2414        OPTIONAL INT32 A;
2415        OPTIONAL group b (LIST) {
2416          REPEATED group list {
2417            REQUIRED FIXED_LEN_BYTE_ARRAY (16) element;
2418          }
2419        }
2420      }
2421    }
2422    ";
2423        let expected_schema = parse_message_type(message_type).unwrap();
2424        let result_schema = roundtrip_schema(Arc::new(expected_schema.clone())).unwrap();
2425        assert_eq!(result_schema, Arc::new(expected_schema));
2426    }
2427
2428    #[test]
2429    fn test_schema_type_thrift_conversion_decimal() {
2430        let message_type = "
2431    message decimals {
2432      OPTIONAL INT32 field0;
2433      OPTIONAL INT64 field1 (DECIMAL (18, 2));
2434      OPTIONAL FIXED_LEN_BYTE_ARRAY (16) field2 (DECIMAL (38, 18));
2435      OPTIONAL BYTE_ARRAY field3 (DECIMAL (9));
2436    }
2437    ";
2438        let expected_schema = parse_message_type(message_type).unwrap();
2439        let result_schema = roundtrip_schema(Arc::new(expected_schema.clone())).unwrap();
2440        assert_eq!(result_schema, Arc::new(expected_schema));
2441    }
2442
2443    // Tests schema conversion from thrift, when num_children is set to Some(0) for a
2444    // primitive type.
2445    #[test]
2446    fn test_schema_from_thrift_with_num_children_set() {
2447        // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT
2448        let message_type = "
2449    message schema {
2450      OPTIONAL BYTE_ARRAY id (UTF8);
2451      OPTIONAL BYTE_ARRAY name (UTF8);
2452      OPTIONAL BYTE_ARRAY message (UTF8);
2453      OPTIONAL INT32 type (UINT_8);
2454      OPTIONAL INT64 author_time (TIMESTAMP_MILLIS);
2455      OPTIONAL INT64 __index_level_0__;
2456    }
2457    ";
2458
2459        let expected_schema = Arc::new(parse_message_type(message_type).unwrap());
2460        let mut buf = schema_to_buf(&expected_schema).unwrap();
2461        let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap();
2462
2463        // Change all of None to Some(0)
2464        for elem in &mut thrift_schema[..] {
2465            if elem.num_children.is_none() {
2466                elem.num_children = Some(0);
2467            }
2468        }
2469
2470        let result_schema = parquet_schema_from_array(thrift_schema).unwrap();
2471        assert_eq!(result_schema, expected_schema);
2472    }
2473
2474    // Sometimes parquet-cpp sets repetition level for the root node, which is against
2475    // the format definition, but we need to handle it by setting it back to None.
2476    #[test]
2477    fn test_schema_from_thrift_root_has_repetition() {
2478        // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT
2479        let message_type = "
2480    message schema {
2481      OPTIONAL BYTE_ARRAY a (UTF8);
2482      OPTIONAL INT32 b (UINT_8);
2483    }
2484    ";
2485
2486        let expected_schema = Arc::new(parse_message_type(message_type).unwrap());
2487        let mut buf = schema_to_buf(&expected_schema).unwrap();
2488        let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap();
2489        thrift_schema[0].repetition_type = Some(Repetition::REQUIRED);
2490
2491        let result_schema = parquet_schema_from_array(thrift_schema).unwrap();
2492        assert_eq!(result_schema, expected_schema);
2493    }
2494
2495    #[test]
2496    fn test_schema_from_thrift_group_has_no_child() {
2497        let message_type = "message schema {}";
2498
2499        let expected_schema = Arc::new(parse_message_type(message_type).unwrap());
2500        let mut buf = schema_to_buf(&expected_schema).unwrap();
2501        let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap();
2502        thrift_schema[0].repetition_type = Some(Repetition::REQUIRED);
2503
2504        let result_schema = parquet_schema_from_array(thrift_schema).unwrap();
2505        assert_eq!(result_schema, expected_schema);
2506    }
2507
2508    #[test]
2509    fn test_parquet_schema_from_array_rejects_negative_num_children() {
2510        let elements = vec![SchemaElement {
2511            r#type: None,
2512            type_length: None,
2513            repetition_type: Some(Repetition::REQUIRED),
2514            name: "schema",
2515            num_children: Some(-1),
2516            converted_type: None,
2517            scale: None,
2518            precision: None,
2519            field_id: None,
2520            logical_type: None,
2521        }];
2522        let result = parquet_schema_from_array(elements);
2523        assert!(result.unwrap_err().to_string().contains("Integer overflow"));
2524    }
2525}