Skip to main content

parquet/schema/
types.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains structs and methods to build Parquet schema and schema descriptors.
19
20use std::vec::IntoIter;
21use std::{collections::HashMap, fmt, sync::Arc};
22
23use crate::file::metadata::HeapSize;
24use crate::file::metadata::thrift::SchemaElement;
25
26use crate::basic::{
27    ColumnOrder, ConvertedType, LogicalType, Repetition, SortOrder, TimeUnit, Type as PhysicalType,
28};
29use crate::errors::{ParquetError, Result};
30
31// ----------------------------------------------------------------------
32// Parquet Type definitions
33
/// Type alias for `Arc<Type>`: a shared, reference-counted handle to a schema node.
pub type TypePtr = Arc<Type>;
/// Type alias for `Arc<SchemaDescriptor>`: a shared, reference-counted handle to a
/// schema descriptor.
pub type SchemaDescPtr = Arc<SchemaDescriptor>;
/// Type alias for `Arc<ColumnDescriptor>`: a shared, reference-counted handle to a
/// leaf column descriptor.
pub type ColumnDescPtr = Arc<ColumnDescriptor>;
40
/// Representation of a Parquet type.
///
/// Used to describe primitive leaf fields and structs, including top-level schema.
///
/// Note that the top-level schema is represented using [`Type::GroupType`] whose
/// repetition is `None`.
///
/// Instances are normally created through [`Type::primitive_type_builder`] and
/// [`Type::group_type_builder`]. Children of a group are stored as [`TypePtr`]s.
#[derive(Clone, Debug, PartialEq)]
pub enum Type {
    /// Represents a primitive leaf field.
    PrimitiveType {
        /// Basic information about the type (name, repetition, annotations, id).
        basic_info: BasicTypeInfo,
        /// Physical type of this primitive type.
        physical_type: PhysicalType,
        /// Length of this type. [`PrimitiveTypeBuilder`] stores `-1` here when no
        /// length was supplied.
        type_length: i32,
        /// Scale of this type. [`PrimitiveTypeBuilder`] stores `-1` here when no
        /// scale was supplied.
        scale: i32,
        /// Precision of this type. [`PrimitiveTypeBuilder`] stores `-1` here when no
        /// precision was supplied.
        precision: i32,
    },
    /// Represents a group of fields (similar to struct).
    GroupType {
        /// Basic information about the type (name, repetition, annotations, id).
        basic_info: BasicTypeInfo,
        /// Fields of this group type, in declaration order.
        fields: Vec<TypePtr>,
    },
}
70
71impl HeapSize for Type {
72    fn heap_size(&self) -> usize {
73        match self {
74            Type::PrimitiveType { basic_info, .. } => basic_info.heap_size(),
75            Type::GroupType { basic_info, fields } => basic_info.heap_size() + fields.heap_size(),
76        }
77    }
78}
79
80impl Type {
81    /// Creates primitive type builder with provided field name and physical type.
82    pub fn primitive_type_builder(
83        name: &str,
84        physical_type: PhysicalType,
85    ) -> PrimitiveTypeBuilder<'_> {
86        PrimitiveTypeBuilder::new(name, physical_type)
87    }
88
89    /// Creates group type builder with provided column name.
90    pub fn group_type_builder(name: &str) -> GroupTypeBuilder<'_> {
91        GroupTypeBuilder::new(name)
92    }
93
94    /// Returns [`BasicTypeInfo`] information about the type.
95    pub fn get_basic_info(&self) -> &BasicTypeInfo {
96        match *self {
97            Type::PrimitiveType { ref basic_info, .. } => basic_info,
98            Type::GroupType { ref basic_info, .. } => basic_info,
99        }
100    }
101
102    /// Returns this type's field name.
103    pub fn name(&self) -> &str {
104        self.get_basic_info().name()
105    }
106
107    /// Gets the fields from this group type.
108    /// Note that this will panic if called on a non-group type.
109    // TODO: should we return `&[&Type]` here?
110    pub fn get_fields(&self) -> &[TypePtr] {
111        match *self {
112            Type::GroupType { ref fields, .. } => &fields[..],
113            _ => panic!("Cannot call get_fields() on a non-group type"),
114        }
115    }
116
117    /// Gets physical type of this primitive type.
118    /// Note that this will panic if called on a non-primitive type.
119    pub fn get_physical_type(&self) -> PhysicalType {
120        match *self {
121            Type::PrimitiveType {
122                basic_info: _,
123                physical_type,
124                ..
125            } => physical_type,
126            _ => panic!("Cannot call get_physical_type() on a non-primitive type"),
127        }
128    }
129
130    /// Gets precision of this primitive type.
131    /// Note that this will panic if called on a non-primitive type.
132    pub fn get_precision(&self) -> i32 {
133        match *self {
134            Type::PrimitiveType { precision, .. } => precision,
135            _ => panic!("Cannot call get_precision() on non-primitive type"),
136        }
137    }
138
139    /// Gets scale of this primitive type.
140    /// Note that this will panic if called on a non-primitive type.
141    pub fn get_scale(&self) -> i32 {
142        match *self {
143            Type::PrimitiveType { scale, .. } => scale,
144            _ => panic!("Cannot call get_scale() on non-primitive type"),
145        }
146    }
147
148    /// Checks if `sub_type` schema is part of current schema.
149    /// This method can be used to check if projected columns are part of the root schema.
150    pub fn check_contains(&self, sub_type: &Type) -> bool {
151        // Names match, and repetitions match or not set for both
152        let basic_match = self.get_basic_info().name() == sub_type.get_basic_info().name()
153            && (self.is_schema() && sub_type.is_schema()
154                || !self.is_schema()
155                    && !sub_type.is_schema()
156                    && self.get_basic_info().repetition()
157                        == sub_type.get_basic_info().repetition());
158
159        match *self {
160            Type::PrimitiveType { .. } if basic_match && sub_type.is_primitive() => {
161                self.get_physical_type() == sub_type.get_physical_type()
162            }
163            Type::GroupType { .. } if basic_match && sub_type.is_group() => {
164                // build hashmap of name -> TypePtr
165                let mut field_map = HashMap::new();
166                for field in self.get_fields() {
167                    field_map.insert(field.name(), field);
168                }
169
170                for field in sub_type.get_fields() {
171                    if !field_map
172                        .get(field.name())
173                        .map(|tpe| tpe.check_contains(field))
174                        .unwrap_or(false)
175                    {
176                        return false;
177                    }
178                }
179                true
180            }
181            _ => false,
182        }
183    }
184
185    /// Returns `true` if this type is a primitive type, `false` otherwise.
186    pub fn is_primitive(&self) -> bool {
187        matches!(*self, Type::PrimitiveType { .. })
188    }
189
190    /// Returns `true` if this type is a group type, `false` otherwise.
191    pub fn is_group(&self) -> bool {
192        matches!(*self, Type::GroupType { .. })
193    }
194
195    /// Returns `true` if this type is the top-level schema type (message type).
196    pub fn is_schema(&self) -> bool {
197        match *self {
198            Type::GroupType { ref basic_info, .. } => !basic_info.has_repetition(),
199            _ => false,
200        }
201    }
202
203    /// Returns `true` if this type is repeated or optional.
204    /// If this type doesn't have repetition defined, we treat it as required.
205    pub fn is_optional(&self) -> bool {
206        self.get_basic_info().has_repetition()
207            && self.get_basic_info().repetition() != Repetition::REQUIRED
208    }
209
210    /// Returns `true` if this type is annotated as a list.
211    pub(crate) fn is_list(&self) -> bool {
212        if self.is_group() {
213            let basic_info = self.get_basic_info();
214            if let Some(logical_type) = basic_info.logical_type_ref() {
215                return logical_type == &LogicalType::List;
216            }
217            return basic_info.converted_type() == ConvertedType::LIST;
218        }
219        false
220    }
221
222    /// Returns `true` if this type is a group with a single child field that is `repeated`.
223    pub(crate) fn has_single_repeated_child(&self) -> bool {
224        if self.is_group() {
225            let children = self.get_fields();
226            return children.len() == 1
227                && children[0].get_basic_info().has_repetition()
228                && children[0].get_basic_info().repetition() == Repetition::REPEATED;
229        }
230        false
231    }
232}
233
/// A builder for primitive types. All attributes are optional
/// except the name and physical type.
/// Note that if not specified explicitly, `Repetition::OPTIONAL` is used.
///
/// Length, precision and scale start at `-1` (see [`PrimitiveTypeBuilder::new`]);
/// [`PrimitiveTypeBuilder::build`] validates the collected attributes.
pub struct PrimitiveTypeBuilder<'a> {
    /// Field name, borrowed from the caller.
    name: &'a str,
    /// Repetition of the field.
    repetition: Repetition,
    /// Physical storage type of the field.
    physical_type: PhysicalType,
    /// Converted type annotation; `ConvertedType::NONE` when not set.
    converted_type: ConvertedType,
    /// Logical type annotation, if any.
    logical_type: Option<LogicalType>,
    /// Type length; `-1` when not set.
    length: i32,
    /// Decimal precision; `-1` when not set.
    precision: i32,
    /// Decimal scale; `-1` when not set.
    scale: i32,
    /// Optional field id.
    id: Option<i32>,
}
248
249impl<'a> PrimitiveTypeBuilder<'a> {
250    /// Creates new primitive type builder with provided field name and physical type.
251    pub fn new(name: &'a str, physical_type: PhysicalType) -> Self {
252        Self {
253            name,
254            repetition: Repetition::OPTIONAL,
255            physical_type,
256            converted_type: ConvertedType::NONE,
257            logical_type: None,
258            length: -1,
259            precision: -1,
260            scale: -1,
261            id: None,
262        }
263    }
264
265    /// Sets [`Repetition`] for this field and returns itself.
266    pub fn with_repetition(self, repetition: Repetition) -> Self {
267        Self { repetition, ..self }
268    }
269
270    /// Sets [`ConvertedType`] for this field and returns itself.
271    pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
272        Self {
273            converted_type,
274            ..self
275        }
276    }
277
278    /// Sets [`LogicalType`] for this field and returns itself.
279    /// If only the logical type is populated for a primitive type, the converted type
280    /// will be automatically populated, and can thus be omitted.
281    pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
282        Self {
283            logical_type,
284            ..self
285        }
286    }
287
288    /// Sets type length and returns itself.
289    /// This is only applied to FIXED_LEN_BYTE_ARRAY and INT96 (INTERVAL) types, because
290    /// they maintain fixed size underlying byte array.
291    /// By default, value is `0`.
292    pub fn with_length(self, length: i32) -> Self {
293        Self { length, ..self }
294    }
295
296    /// Sets precision for Parquet DECIMAL physical type and returns itself.
297    /// By default, it equals to `0` and used only for decimal context.
298    pub fn with_precision(self, precision: i32) -> Self {
299        Self { precision, ..self }
300    }
301
302    /// Sets scale for Parquet DECIMAL physical type and returns itself.
303    /// By default, it equals to `0` and used only for decimal context.
304    pub fn with_scale(self, scale: i32) -> Self {
305        Self { scale, ..self }
306    }
307
308    /// Sets optional field id and returns itself.
309    pub fn with_id(self, id: Option<i32>) -> Self {
310        Self { id, ..self }
311    }
312
313    /// Creates a new `PrimitiveType` instance from the collected attributes.
314    /// Returns `Err` in case of any building conditions are not met.
315    pub fn build(self) -> Result<Type> {
316        let mut basic_info = BasicTypeInfo {
317            name: String::from(self.name),
318            repetition: Some(self.repetition),
319            converted_type: self.converted_type,
320            logical_type: self.logical_type.clone(),
321            id: self.id,
322        };
323
324        // Check length before logical type, since it is used for logical type validation.
325        if self.physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY && self.length < 0 {
326            return Err(general_err!(
327                "Invalid FIXED_LEN_BYTE_ARRAY length: {} for field '{}'",
328                self.length,
329                self.name
330            ));
331        }
332
333        if let Some(logical_type) = &self.logical_type {
334            // If a converted type is populated, check that it is consistent with
335            // its logical type
336            if self.converted_type != ConvertedType::NONE {
337                if ConvertedType::from(self.logical_type.clone()) != self.converted_type {
338                    return Err(general_err!(
339                        "Logical type {:?} is incompatible with converted type {} for field '{}'",
340                        logical_type,
341                        self.converted_type,
342                        self.name
343                    ));
344                }
345            } else {
346                // Populate the converted type for backwards compatibility
347                basic_info.converted_type = self.logical_type.clone().into();
348            }
349            // Check that logical type and physical type are compatible
350            match (logical_type, self.physical_type) {
351                (LogicalType::Map, _) | (LogicalType::List, _) => {
352                    return Err(general_err!(
353                        "{:?} cannot be applied to a primitive type for field '{}'",
354                        logical_type,
355                        self.name
356                    ));
357                }
358                (LogicalType::Enum, PhysicalType::BYTE_ARRAY) => {}
359                (LogicalType::Decimal { scale, precision }, _) => {
360                    // Check that scale and precision are consistent with legacy values
361                    if *scale != self.scale {
362                        return Err(general_err!(
363                            "DECIMAL logical type scale {} must match self.scale {} for field '{}'",
364                            scale,
365                            self.scale,
366                            self.name
367                        ));
368                    }
369                    if *precision != self.precision {
370                        return Err(general_err!(
371                            "DECIMAL logical type precision {} must match self.precision {} for field '{}'",
372                            precision,
373                            self.precision,
374                            self.name
375                        ));
376                    }
377                    self.check_decimal_precision_scale()?;
378                }
379                (LogicalType::Date, PhysicalType::INT32) => {}
380                (
381                    LogicalType::Time {
382                        unit: TimeUnit::MILLIS,
383                        ..
384                    },
385                    PhysicalType::INT32,
386                ) => {}
387                (LogicalType::Time { unit, .. }, PhysicalType::INT64) => {
388                    if *unit == TimeUnit::MILLIS {
389                        return Err(general_err!(
390                            "Cannot use millisecond unit on INT64 type for field '{}'",
391                            self.name
392                        ));
393                    }
394                }
395                (LogicalType::Timestamp { .. }, PhysicalType::INT64) => {}
396                (LogicalType::Integer { bit_width, .. }, PhysicalType::INT32)
397                    if *bit_width <= 32 => {}
398                (LogicalType::Integer { bit_width, .. }, PhysicalType::INT64)
399                    if *bit_width == 64 => {}
400                // Null type
401                (LogicalType::Unknown, _) => {}
402                (LogicalType::String, PhysicalType::BYTE_ARRAY) => {}
403                (LogicalType::Json, PhysicalType::BYTE_ARRAY) => {}
404                (LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {}
405                (LogicalType::Geometry { .. }, PhysicalType::BYTE_ARRAY) => {}
406                (LogicalType::Geography { .. }, PhysicalType::BYTE_ARRAY) => {}
407                (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) if self.length == 16 => {}
408                (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
409                    return Err(general_err!(
410                        "UUID cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(16) field",
411                        self.name
412                    ));
413                }
414                (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY) if self.length == 2 => {}
415                (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
416                    return Err(general_err!(
417                        "FLOAT16 cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(2) field",
418                        self.name
419                    ));
420                }
421                // unknown logical type means just use physical type
422                (LogicalType::_Unknown { .. }, _) => {}
423                (a, b) => {
424                    return Err(general_err!(
425                        "Cannot annotate {:?} from {} for field '{}'",
426                        a,
427                        b,
428                        self.name
429                    ));
430                }
431            }
432        }
433
434        match self.converted_type {
435            ConvertedType::NONE => {}
436            ConvertedType::UTF8 | ConvertedType::BSON | ConvertedType::JSON => {
437                if self.physical_type != PhysicalType::BYTE_ARRAY {
438                    return Err(general_err!(
439                        "{} cannot annotate field '{}' because it is not a BYTE_ARRAY field",
440                        self.converted_type,
441                        self.name
442                    ));
443                }
444            }
445            ConvertedType::DECIMAL => {
446                self.check_decimal_precision_scale()?;
447            }
448            ConvertedType::DATE
449            | ConvertedType::TIME_MILLIS
450            | ConvertedType::UINT_8
451            | ConvertedType::UINT_16
452            | ConvertedType::UINT_32
453            | ConvertedType::INT_8
454            | ConvertedType::INT_16
455            | ConvertedType::INT_32 => {
456                if self.physical_type != PhysicalType::INT32 {
457                    return Err(general_err!(
458                        "{} cannot annotate field '{}' because it is not a INT32 field",
459                        self.converted_type,
460                        self.name
461                    ));
462                }
463            }
464            ConvertedType::TIME_MICROS
465            | ConvertedType::TIMESTAMP_MILLIS
466            | ConvertedType::TIMESTAMP_MICROS
467            | ConvertedType::UINT_64
468            | ConvertedType::INT_64 => {
469                if self.physical_type != PhysicalType::INT64 {
470                    return Err(general_err!(
471                        "{} cannot annotate field '{}' because it is not a INT64 field",
472                        self.converted_type,
473                        self.name
474                    ));
475                }
476            }
477            ConvertedType::INTERVAL => {
478                if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 {
479                    return Err(general_err!(
480                        "INTERVAL cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(12) field",
481                        self.name
482                    ));
483                }
484            }
485            ConvertedType::ENUM => {
486                if self.physical_type != PhysicalType::BYTE_ARRAY {
487                    return Err(general_err!(
488                        "ENUM cannot annotate field '{}' because it is not a BYTE_ARRAY field",
489                        self.name
490                    ));
491                }
492            }
493            _ => {
494                return Err(general_err!(
495                    "{} cannot be applied to primitive field '{}'",
496                    self.converted_type,
497                    self.name
498                ));
499            }
500        }
501
502        Ok(Type::PrimitiveType {
503            basic_info,
504            physical_type: self.physical_type,
505            type_length: self.length,
506            scale: self.scale,
507            precision: self.precision,
508        })
509    }
510
511    #[inline]
512    fn check_decimal_precision_scale(&self) -> Result<()> {
513        match self.physical_type {
514            PhysicalType::INT32
515            | PhysicalType::INT64
516            | PhysicalType::BYTE_ARRAY
517            | PhysicalType::FIXED_LEN_BYTE_ARRAY => (),
518            _ => {
519                return Err(general_err!(
520                    "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
521                ));
522            }
523        }
524
525        // Precision is required and must be a non-zero positive integer.
526        if self.precision < 1 {
527            return Err(general_err!(
528                "Invalid DECIMAL precision: {}",
529                self.precision
530            ));
531        }
532
533        // Scale must be zero or a positive integer less than the precision.
534        if self.scale < 0 {
535            return Err(general_err!("Invalid DECIMAL scale: {}", self.scale));
536        }
537
538        if self.scale > self.precision {
539            return Err(general_err!(
540                "Invalid DECIMAL: scale ({}) cannot be greater than precision \
541             ({})",
542                self.scale,
543                self.precision
544            ));
545        }
546
547        // Check precision and scale based on physical type limitations.
548        match self.physical_type {
549            PhysicalType::INT32 => {
550                if self.precision > 9 {
551                    return Err(general_err!(
552                        "Cannot represent INT32 as DECIMAL with precision {}",
553                        self.precision
554                    ));
555                }
556            }
557            PhysicalType::INT64 => {
558                if self.precision > 18 {
559                    return Err(general_err!(
560                        "Cannot represent INT64 as DECIMAL with precision {}",
561                        self.precision
562                    ));
563                }
564            }
565            PhysicalType::FIXED_LEN_BYTE_ARRAY => {
566                let length = self
567                    .length
568                    .checked_mul(8)
569                    .ok_or(general_err!("Invalid length {} for Decimal", self.length))?;
570                let max_precision = (2f64.powi(length - 1) - 1f64).log10().floor() as i32;
571
572                if self.precision > max_precision {
573                    return Err(general_err!(
574                        "Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length {} and \
575                        precision {}. The max precision can only be {}",
576                        self.length,
577                        self.precision,
578                        max_precision
579                    ));
580                }
581            }
582            _ => (), // For BYTE_ARRAY precision is not limited
583        }
584
585        Ok(())
586    }
587}
588
/// A builder for group types. All attributes are optional except the name.
/// Note that if not specified explicitly, `None` is used as the repetition of the group,
/// which means it is a root (message) type.
pub struct GroupTypeBuilder<'a> {
    /// Group name, borrowed from the caller.
    name: &'a str,
    /// Repetition of the group; `None` marks a root (message) type.
    repetition: Option<Repetition>,
    /// Converted type annotation; `ConvertedType::NONE` when not set.
    converted_type: ConvertedType,
    /// Logical type annotation, if any.
    logical_type: Option<LogicalType>,
    /// Child nodes of the group.
    fields: Vec<TypePtr>,
    /// Optional field id.
    id: Option<i32>,
}
600
601impl<'a> GroupTypeBuilder<'a> {
602    /// Creates new group type builder with provided field name.
603    pub fn new(name: &'a str) -> Self {
604        Self {
605            name,
606            repetition: None,
607            converted_type: ConvertedType::NONE,
608            logical_type: None,
609            fields: Vec::new(),
610            id: None,
611        }
612    }
613
614    /// Sets [`Repetition`] for this field and returns itself.
615    pub fn with_repetition(mut self, repetition: Repetition) -> Self {
616        self.repetition = Some(repetition);
617        self
618    }
619
620    /// Sets [`ConvertedType`] for this field and returns itself.
621    pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
622        Self {
623            converted_type,
624            ..self
625        }
626    }
627
628    /// Sets [`LogicalType`] for this field and returns itself.
629    pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
630        Self {
631            logical_type,
632            ..self
633        }
634    }
635
636    /// Sets a list of fields that should be child nodes of this field.
637    /// Returns updated self.
638    pub fn with_fields(self, fields: Vec<TypePtr>) -> Self {
639        Self { fields, ..self }
640    }
641
642    /// Sets optional field id and returns itself.
643    pub fn with_id(self, id: Option<i32>) -> Self {
644        Self { id, ..self }
645    }
646
647    /// Creates a new `GroupType` instance from the gathered attributes.
648    pub fn build(self) -> Result<Type> {
649        let mut basic_info = BasicTypeInfo {
650            name: String::from(self.name),
651            repetition: self.repetition,
652            converted_type: self.converted_type,
653            logical_type: self.logical_type.clone(),
654            id: self.id,
655        };
656        // Populate the converted type if only the logical type is populated
657        if self.logical_type.is_some() && self.converted_type == ConvertedType::NONE {
658            basic_info.converted_type = self.logical_type.into();
659        }
660        Ok(Type::GroupType {
661            basic_info,
662            fields: self.fields,
663        })
664    }
665}
666
/// Basic type info shared by primitive and group types: the name of the type, its
/// optional repetition, its converted and logical type annotations, and its
/// optional field id.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BasicTypeInfo {
    /// Field name.
    name: String,
    /// Repetition of the field; `None` for a root (message) group.
    repetition: Option<Repetition>,
    /// Converted type annotation.
    converted_type: ConvertedType,
    /// Logical type annotation, if any.
    logical_type: Option<LogicalType>,
    /// Optional field id.
    id: Option<i32>,
}
677
678impl HeapSize for BasicTypeInfo {
679    fn heap_size(&self) -> usize {
680        // no heap allocations in any other subfield
681        self.name.heap_size()
682    }
683}
684
685impl BasicTypeInfo {
686    /// Returns field name.
687    pub fn name(&self) -> &str {
688        &self.name
689    }
690
691    /// Returns `true` if type has repetition field set, `false` otherwise.
692    /// This is mostly applied to group type, because primitive type always has
693    /// repetition set.
694    pub fn has_repetition(&self) -> bool {
695        self.repetition.is_some()
696    }
697
698    /// Returns [`Repetition`] value for the type.
699    pub fn repetition(&self) -> Repetition {
700        assert!(self.repetition.is_some());
701        self.repetition.unwrap()
702    }
703
704    /// Returns [`ConvertedType`] value for the type.
705    pub fn converted_type(&self) -> ConvertedType {
706        self.converted_type
707    }
708
709    /// Returns [`LogicalType`] value for the type.
710    ///
711    /// Note that this function will clone the `LogicalType`. If performance is a concern,
712    /// use [`Self::logical_type_ref`] instead.
713    #[deprecated(
714        since = "57.1.0",
715        note = "use `BasicTypeInfo::logical_type_ref` instead (LogicalType cloning is non trivial)"
716    )]
717    pub fn logical_type(&self) -> Option<LogicalType> {
718        // Unlike ConvertedType, LogicalType cannot implement Copy, thus we clone it
719        self.logical_type.clone()
720    }
721
722    /// Return a reference to the [`LogicalType`] value for the type.
723    pub fn logical_type_ref(&self) -> Option<&LogicalType> {
724        self.logical_type.as_ref()
725    }
726
727    /// Returns `true` if id is set, `false` otherwise.
728    pub fn has_id(&self) -> bool {
729        self.id.is_some()
730    }
731
732    /// Returns id value for the type.
733    pub fn id(&self) -> i32 {
734        assert!(self.id.is_some());
735        self.id.unwrap()
736    }
737}
738
739// ----------------------------------------------------------------------
740// Parquet descriptor definitions
741
/// Represents the location of a column in a Parquet schema
///
/// The `Display` implementation prints the dot-joined path wrapped in double quotes.
///
/// # Example: refer to column named `'my_column'`
/// ```
/// # use parquet::schema::types::ColumnPath;
/// let column_path = ColumnPath::from("my_column");
/// ```
///
/// # Example: refer to column named `c` in a nested struct `{a: {b: {c: ...}}}`
/// ```
/// # use parquet::schema::types::ColumnPath;
/// // form path 'a.b.c'
/// let column_path = ColumnPath::from(vec![
///   String::from("a"),
///   String::from("b"),
///   String::from("c")
/// ]);
/// ```
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub struct ColumnPath {
    /// Path components, outermost field first.
    parts: Vec<String>,
}
764
765impl HeapSize for ColumnPath {
766    fn heap_size(&self) -> usize {
767        self.parts.heap_size()
768    }
769}
770
771impl ColumnPath {
772    /// Creates new column path from vector of field names.
773    pub fn new(parts: Vec<String>) -> Self {
774        ColumnPath { parts }
775    }
776
777    /// Returns string representation of this column path.
778    /// ```rust
779    /// use parquet::schema::types::ColumnPath;
780    ///
781    /// let path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c".to_string()]);
782    /// assert_eq!(&path.string(), "a.b.c");
783    /// ```
784    pub fn string(&self) -> String {
785        self.parts.join(".")
786    }
787
788    /// Appends more components to end of column path.
789    /// ```rust
790    /// use parquet::schema::types::ColumnPath;
791    ///
792    /// let mut path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c"
793    /// .to_string()]);
794    /// assert_eq!(&path.string(), "a.b.c");
795    ///
796    /// path.append(vec!["d".to_string(), "e".to_string()]);
797    /// assert_eq!(&path.string(), "a.b.c.d.e");
798    /// ```
799    pub fn append(&mut self, mut tail: Vec<String>) {
800        self.parts.append(&mut tail);
801    }
802
803    /// Returns a slice of path components.
804    pub fn parts(&self) -> &[String] {
805        &self.parts
806    }
807}
808
809impl fmt::Display for ColumnPath {
810    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
811        write!(f, "{:?}", self.string())
812    }
813}
814
815impl From<Vec<String>> for ColumnPath {
816    fn from(parts: Vec<String>) -> Self {
817        ColumnPath { parts }
818    }
819}
820
821impl From<&str> for ColumnPath {
822    fn from(single_path: &str) -> Self {
823        let s = String::from(single_path);
824        ColumnPath::from(s)
825    }
826}
827
828impl From<String> for ColumnPath {
829    fn from(single_path: String) -> Self {
830        let v = vec![single_path];
831        ColumnPath { parts: v }
832    }
833}
834
835impl AsRef<[String]> for ColumnPath {
836    fn as_ref(&self) -> &[String] {
837        &self.parts
838    }
839}
840
/// Descriptor for a leaf-level primitive column.
///
/// Holds the leaf's primitive type together with the maximum definition and
/// repetition levels required to re-assemble nested data, and the path of
/// the column within the schema.
#[derive(Debug, PartialEq)]
pub struct ColumnDescriptor {
    /// The "leaf" primitive type of this column
    primitive_type: TypePtr,

    /// The maximum definition level for this column
    max_def_level: i16,

    /// The maximum repetition level for this column
    max_rep_level: i16,

    /// The definition level at the nearest REPEATED ancestor, or 0 if none.
    ///
    /// Descriptors created through [`ColumnDescriptor::new`] always store 0 here.
    repeated_ancestor_def_level: i16,

    /// The path of this column. For instance, "a.b.c.d".
    path: ColumnPath,
}
862
863impl HeapSize for ColumnDescriptor {
864    fn heap_size(&self) -> usize {
865        // Don't include the heap size of primitive_type, this is already
866        // accounted for via SchemaDescriptor::schema
867        self.path.heap_size()
868    }
869}
870
871impl ColumnDescriptor {
872    /// Creates new descriptor for leaf-level column.
873    pub fn new(
874        primitive_type: TypePtr,
875        max_def_level: i16,
876        max_rep_level: i16,
877        path: ColumnPath,
878    ) -> Self {
879        Self::new_with_repeated_ancestor(primitive_type, max_def_level, max_rep_level, path, 0)
880    }
881
882    pub(crate) fn new_with_repeated_ancestor(
883        primitive_type: TypePtr,
884        max_def_level: i16,
885        max_rep_level: i16,
886        path: ColumnPath,
887        repeated_ancestor_def_level: i16,
888    ) -> Self {
889        Self {
890            primitive_type,
891            max_def_level,
892            max_rep_level,
893            repeated_ancestor_def_level,
894            path,
895        }
896    }
897
898    /// Returns maximum definition level for this column.
899    #[inline]
900    pub fn max_def_level(&self) -> i16 {
901        self.max_def_level
902    }
903
904    /// Returns maximum repetition level for this column.
905    #[inline]
906    pub fn max_rep_level(&self) -> i16 {
907        self.max_rep_level
908    }
909
910    /// Returns the definition level at the nearest REPEATED ancestor, or 0 if none.
911    #[inline]
912    pub fn repeated_ancestor_def_level(&self) -> i16 {
913        self.repeated_ancestor_def_level
914    }
915
916    /// Returns [`ColumnPath`] for this column.
917    pub fn path(&self) -> &ColumnPath {
918        &self.path
919    }
920
921    /// Returns self type [`Type`] for this leaf column.
922    pub fn self_type(&self) -> &Type {
923        self.primitive_type.as_ref()
924    }
925
926    /// Returns self type [`TypePtr`]  for this leaf
927    /// column.
928    pub fn self_type_ptr(&self) -> TypePtr {
929        self.primitive_type.clone()
930    }
931
932    /// Returns column name.
933    pub fn name(&self) -> &str {
934        self.primitive_type.name()
935    }
936
937    /// Returns [`ConvertedType`] for this column.
938    pub fn converted_type(&self) -> ConvertedType {
939        self.primitive_type.get_basic_info().converted_type()
940    }
941
942    /// Returns [`LogicalType`] for this column.
943    ///
944    /// Note that this function will clone the `LogicalType`. If performance is a concern,
945    /// use [`Self::logical_type_ref`] instead.
946    #[deprecated(
947        since = "57.1.0",
948        note = "use `ColumnDescriptor::logical_type_ref` instead (LogicalType cloning is non trivial)"
949    )]
950    pub fn logical_type(&self) -> Option<LogicalType> {
951        self.primitive_type
952            .get_basic_info()
953            .logical_type_ref()
954            .cloned()
955    }
956
957    /// Returns a reference to the [`LogicalType`] for this column.
958    pub fn logical_type_ref(&self) -> Option<&LogicalType> {
959        self.primitive_type.get_basic_info().logical_type_ref()
960    }
961
962    /// Returns physical type for this column.
963    /// Note that it will panic if called on a non-primitive type.
964    pub fn physical_type(&self) -> PhysicalType {
965        match self.primitive_type.as_ref() {
966            Type::PrimitiveType { physical_type, .. } => *physical_type,
967            _ => panic!("Expected primitive type!"),
968        }
969    }
970
971    /// Returns type length for this column.
972    /// Note that it will panic if called on a non-primitive type.
973    pub fn type_length(&self) -> i32 {
974        match self.primitive_type.as_ref() {
975            Type::PrimitiveType { type_length, .. } => *type_length,
976            _ => panic!("Expected primitive type!"),
977        }
978    }
979
980    /// Returns type precision for this column.
981    /// Note that it will panic if called on a non-primitive type.
982    pub fn type_precision(&self) -> i32 {
983        match self.primitive_type.as_ref() {
984            Type::PrimitiveType { precision, .. } => *precision,
985            _ => panic!("Expected primitive type!"),
986        }
987    }
988
989    /// Returns type scale for this column.
990    /// Note that it will panic if called on a non-primitive type.
991    pub fn type_scale(&self) -> i32 {
992        match self.primitive_type.as_ref() {
993            Type::PrimitiveType { scale, .. } => *scale,
994            _ => panic!("Expected primitive type!"),
995        }
996    }
997
998    /// Returns the sort order for this column
999    pub fn sort_order(&self) -> SortOrder {
1000        ColumnOrder::sort_order_for_type(
1001            self.logical_type_ref(),
1002            self.converted_type(),
1003            self.physical_type(),
1004        )
1005    }
1006}
1007
/// Schema of a Parquet file.
///
/// Encapsulates the file's schema ([`Type`]) and [`ColumnDescriptor`]s for
/// each primitive (leaf) column.
///
/// # Example
/// ```
/// # use std::sync::Arc;
/// use parquet::schema::types::{SchemaDescriptor, Type};
/// use parquet::basic; // note there are two `Type`s that are different
/// // Schema for a table with two columns: "a" (int64) and "b" (int32, stored as a date)
/// let descriptor = SchemaDescriptor::new(
///   Arc::new(
///     Type::group_type_builder("my_schema")
///       .with_fields(vec![
///         Arc::new(
///          Type::primitive_type_builder("a", basic::Type::INT64)
///           .build().unwrap()
///         ),
///         Arc::new(
///          Type::primitive_type_builder("b", basic::Type::INT32)
///           .with_converted_type(basic::ConvertedType::DATE)
///           .with_logical_type(Some(basic::LogicalType::Date))
///           .build().unwrap()
///         ),
///      ])
///      .build().unwrap()
///   )
/// );
/// ```
#[derive(PartialEq, Clone)]
pub struct SchemaDescriptor {
    /// The top-level logical schema (the "message" type).
    ///
    /// This must be a [`Type::GroupType`] where each field is a root
    /// column type in the schema.
    schema: TypePtr,

    /// The descriptors for the physical type of each leaf column in this schema
    ///
    /// Constructed from `schema` in DFS order.
    leaves: Vec<ColumnDescPtr>,

    /// Mapping from a leaf column's index to the root column index that it
    /// comes from.
    ///
    /// Holds one entry per element of `leaves`, in the same order; each
    /// value is an index into the fields of `schema`.
    ///
    /// For instance: the leaf `a.b.c.d` would have a link back to `a`:
    /// ```text
    /// -- a  <-----+
    /// -- -- b     |
    /// -- -- -- c  |
    /// -- -- -- -- d
    /// ```
    leaf_to_base: Vec<usize>,
}
1063
1064impl fmt::Debug for SchemaDescriptor {
1065    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1066        // Skip leaves and leaf_to_base as they only a cache information already found in `schema`
1067        f.debug_struct("SchemaDescriptor")
1068            .field("schema", &self.schema)
1069            .finish()
1070    }
1071}
1072
1073// Need to implement HeapSize in this module as the fields are private
1074impl HeapSize for SchemaDescriptor {
1075    fn heap_size(&self) -> usize {
1076        self.schema.heap_size() + self.leaves.heap_size() + self.leaf_to_base.heap_size()
1077    }
1078}
1079
1080impl SchemaDescriptor {
1081    /// Creates new schema descriptor from Parquet schema.
1082    pub fn new(tp: TypePtr) -> Self {
1083        const INIT_SCHEMA_DEPTH: usize = 16;
1084        assert!(tp.is_group(), "SchemaDescriptor should take a GroupType");
1085        // unwrap should be safe since we just asserted tp is a group
1086        let n_leaves = num_leaves(&tp).unwrap();
1087        let mut leaves = Vec::with_capacity(n_leaves);
1088        let mut leaf_to_base = Vec::with_capacity(n_leaves);
1089        let mut path = Vec::with_capacity(INIT_SCHEMA_DEPTH);
1090        for (root_idx, f) in tp.get_fields().iter().enumerate() {
1091            path.clear();
1092            build_tree(
1093                f,
1094                root_idx,
1095                0,
1096                0,
1097                0,
1098                &mut leaves,
1099                &mut leaf_to_base,
1100                &mut path,
1101            );
1102        }
1103
1104        Self {
1105            schema: tp,
1106            leaves,
1107            leaf_to_base,
1108        }
1109    }
1110
1111    /// Returns [`ColumnDescriptor`] for a field position.
1112    pub fn column(&self, i: usize) -> ColumnDescPtr {
1113        assert!(
1114            i < self.leaves.len(),
1115            "Index out of bound: {} not in [0, {})",
1116            i,
1117            self.leaves.len()
1118        );
1119        self.leaves[i].clone()
1120    }
1121
1122    /// Returns slice of [`ColumnDescriptor`].
1123    pub fn columns(&self) -> &[ColumnDescPtr] {
1124        &self.leaves
1125    }
1126
1127    /// Returns number of leaf-level columns.
1128    pub fn num_columns(&self) -> usize {
1129        self.leaves.len()
1130    }
1131
1132    /// Returns column root [`Type`] for a leaf position.
1133    pub fn get_column_root(&self, i: usize) -> &Type {
1134        let result = self.column_root_of(i);
1135        result.as_ref()
1136    }
1137
1138    /// Returns column root [`Type`] pointer for a leaf position.
1139    pub fn get_column_root_ptr(&self, i: usize) -> TypePtr {
1140        let result = self.column_root_of(i);
1141        result.clone()
1142    }
1143
1144    /// Returns the index of the root column for a field position
1145    pub fn get_column_root_idx(&self, leaf: usize) -> usize {
1146        assert!(
1147            leaf < self.leaves.len(),
1148            "Index out of bound: {} not in [0, {})",
1149            leaf,
1150            self.leaves.len()
1151        );
1152
1153        *self
1154            .leaf_to_base
1155            .get(leaf)
1156            .unwrap_or_else(|| panic!("Expected a value for index {leaf} but found None"))
1157    }
1158
1159    fn column_root_of(&self, i: usize) -> &TypePtr {
1160        &self.schema.get_fields()[self.get_column_root_idx(i)]
1161    }
1162
1163    /// Returns schema as [`Type`].
1164    pub fn root_schema(&self) -> &Type {
1165        self.schema.as_ref()
1166    }
1167
1168    /// Returns schema as [`TypePtr`] for cheap cloning.
1169    pub fn root_schema_ptr(&self) -> TypePtr {
1170        self.schema.clone()
1171    }
1172
1173    /// Returns schema name.
1174    pub fn name(&self) -> &str {
1175        self.schema.name()
1176    }
1177}
1178
1179// walk tree and count nodes
1180pub(crate) fn num_nodes(tp: &TypePtr) -> Result<usize> {
1181    if !tp.is_group() {
1182        return Err(general_err!("Root schema must be Group type"));
1183    }
1184    let mut n_nodes = 1usize; // count root
1185    for f in tp.get_fields().iter() {
1186        count_nodes(f, &mut n_nodes);
1187    }
1188    Ok(n_nodes)
1189}
1190
1191pub(crate) fn count_nodes(tp: &TypePtr, n_nodes: &mut usize) {
1192    *n_nodes += 1;
1193    if let Type::GroupType { fields, .. } = tp.as_ref() {
1194        for f in fields {
1195            count_nodes(f, n_nodes);
1196        }
1197    }
1198}
1199
1200// do a quick walk of the tree to get proper sizing for SchemaDescriptor arrays
1201fn num_leaves(tp: &TypePtr) -> Result<usize> {
1202    if !tp.is_group() {
1203        return Err(general_err!("Root schema must be Group type"));
1204    }
1205    let mut n_leaves = 0usize;
1206    for f in tp.get_fields().iter() {
1207        count_leaves(f, &mut n_leaves);
1208    }
1209    Ok(n_leaves)
1210}
1211
1212fn count_leaves(tp: &TypePtr, n_leaves: &mut usize) {
1213    match tp.as_ref() {
1214        Type::PrimitiveType { .. } => *n_leaves += 1,
1215        Type::GroupType { fields, .. } => {
1216            for f in fields {
1217                count_leaves(f, n_leaves);
1218            }
1219        }
1220    }
1221}
1222
1223#[allow(clippy::too_many_arguments)]
1224fn build_tree<'a>(
1225    tp: &'a TypePtr,
1226    root_idx: usize,
1227    mut max_rep_level: i16,
1228    mut max_def_level: i16,
1229    mut repeated_ancestor_def_level: i16,
1230    leaves: &mut Vec<ColumnDescPtr>,
1231    leaf_to_base: &mut Vec<usize>,
1232    path_so_far: &mut Vec<&'a str>,
1233) {
1234    assert!(tp.get_basic_info().has_repetition());
1235
1236    path_so_far.push(tp.name());
1237    match tp.get_basic_info().repetition() {
1238        Repetition::OPTIONAL => {
1239            max_def_level += 1;
1240        }
1241        Repetition::REPEATED => {
1242            max_def_level += 1;
1243            max_rep_level += 1;
1244            repeated_ancestor_def_level = max_def_level;
1245        }
1246        _ => {}
1247    }
1248
1249    match tp.as_ref() {
1250        Type::PrimitiveType { .. } => {
1251            let mut path: Vec<String> = vec![];
1252            path.extend(path_so_far.iter().copied().map(String::from));
1253            let desc = ColumnDescriptor::new_with_repeated_ancestor(
1254                tp.clone(),
1255                max_def_level,
1256                max_rep_level,
1257                ColumnPath::new(path),
1258                repeated_ancestor_def_level,
1259            );
1260            leaves.push(Arc::new(desc));
1261            leaf_to_base.push(root_idx);
1262        }
1263        Type::GroupType { fields, .. } => {
1264            for f in fields {
1265                build_tree(
1266                    f,
1267                    root_idx,
1268                    max_rep_level,
1269                    max_def_level,
1270                    repeated_ancestor_def_level,
1271                    leaves,
1272                    leaf_to_base,
1273                    path_so_far,
1274                );
1275                path_so_far.pop();
1276            }
1277        }
1278    }
1279}
1280
1281/// Checks if the logical type is valid.
1282fn check_logical_type(logical_type: &Option<LogicalType>) -> Result<()> {
1283    if let Some(LogicalType::Integer { bit_width, .. }) = *logical_type {
1284        if bit_width != 8 && bit_width != 16 && bit_width != 32 && bit_width != 64 {
1285            return Err(general_err!(
1286                "Bit width must be 8, 16, 32, or 64 for Integer logical type"
1287            ));
1288        }
1289    }
1290    Ok(())
1291}
1292
1293// convert thrift decoded array of `SchemaElement` into this crate's representation of
1294// parquet types. this function consumes `elements`.
1295pub(crate) fn parquet_schema_from_array<'a>(elements: Vec<SchemaElement<'a>>) -> Result<TypePtr> {
1296    let mut index = 0;
1297    let num_elements = elements.len();
1298    let mut schema_nodes = Vec::with_capacity(1); // there should only be one element when done
1299
1300    // turn into iterator so we can take ownership of elements of the vector
1301    let mut elements = elements.into_iter();
1302
1303    while index < num_elements {
1304        let t = schema_from_array_helper(&mut elements, num_elements, index)?;
1305        index = t.0;
1306        schema_nodes.push(t.1);
1307    }
1308    if schema_nodes.len() != 1 {
1309        return Err(general_err!(
1310            "Expected exactly one root node, but found {}",
1311            schema_nodes.len()
1312        ));
1313    }
1314
1315    if !schema_nodes[0].is_group() {
1316        return Err(general_err!("Expected root node to be a group type"));
1317    }
1318
1319    Ok(schema_nodes.remove(0))
1320}
1321
1322// recursive helper function for schema conversion
1323fn schema_from_array_helper<'a>(
1324    elements: &mut IntoIter<SchemaElement<'a>>,
1325    num_elements: usize,
1326    index: usize,
1327) -> Result<(usize, TypePtr)> {
1328    // Whether or not the current node is root (message type).
1329    // There is only one message type node in the schema tree.
1330    let is_root_node = index == 0;
1331
1332    if index >= num_elements {
1333        return Err(general_err!(
1334            "Index out of bound, index = {}, len = {}",
1335            index,
1336            num_elements
1337        ));
1338    }
1339    let element = elements.next().expect("schema vector should not be empty");
1340
1341    // Check for empty schema
1342    if let (true, None | Some(0)) = (is_root_node, element.num_children) {
1343        let builder = Type::group_type_builder(element.name);
1344        return Ok((index + 1, Arc::new(builder.build().unwrap())));
1345    }
1346
1347    let converted_type = element.converted_type.unwrap_or(ConvertedType::NONE);
1348
1349    // LogicalType is prefered to ConvertedType, but both may be present.
1350    let logical_type = element.logical_type;
1351
1352    check_logical_type(&logical_type)?;
1353
1354    let field_id = element.field_id;
1355    match element.num_children {
1356        // From parquet-format:
1357        //   The children count is used to construct the nested relationship.
1358        //   This field is not set when the element is a primitive type
1359        // Sometimes parquet-cpp sets num_children field to 0 for primitive types, so we
1360        // have to handle this case too.
1361        None | Some(0) => {
1362            // primitive type
1363            if element.repetition_type.is_none() {
1364                return Err(general_err!(
1365                    "Repetition level must be defined for a primitive type"
1366                ));
1367            }
1368            let repetition = element.repetition_type.unwrap();
1369            if let Some(physical_type) = element.r#type {
1370                let length = element.type_length.unwrap_or(-1);
1371                let scale = element.scale.unwrap_or(-1);
1372                let precision = element.precision.unwrap_or(-1);
1373                let name = element.name;
1374                let builder = Type::primitive_type_builder(name, physical_type)
1375                    .with_repetition(repetition)
1376                    .with_converted_type(converted_type)
1377                    .with_logical_type(logical_type)
1378                    .with_length(length)
1379                    .with_precision(precision)
1380                    .with_scale(scale)
1381                    .with_id(field_id);
1382                Ok((index + 1, Arc::new(builder.build()?)))
1383            } else {
1384                let mut builder = Type::group_type_builder(element.name)
1385                    .with_converted_type(converted_type)
1386                    .with_logical_type(logical_type)
1387                    .with_id(field_id);
1388                if !is_root_node {
1389                    // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or
1390                    // REPEATED for root node.
1391                    //
1392                    // We only set repetition for group types that are not top-level message
1393                    // type. According to parquet-format:
1394                    //   Root of the schema does not have a repetition_type.
1395                    //   All other types must have one.
1396                    builder = builder.with_repetition(repetition);
1397                }
1398                Ok((index + 1, Arc::new(builder.build().unwrap())))
1399            }
1400        }
1401        Some(n) => {
1402            let repetition = element.repetition_type;
1403
1404            let mut fields = Vec::with_capacity(usize::try_from(n)?);
1405            let mut next_index = index + 1;
1406            for _ in 0..n {
1407                let child_result = schema_from_array_helper(elements, num_elements, next_index)?;
1408                next_index = child_result.0;
1409                fields.push(child_result.1);
1410            }
1411
1412            let mut builder = Type::group_type_builder(element.name)
1413                .with_converted_type(converted_type)
1414                .with_logical_type(logical_type)
1415                .with_fields(fields)
1416                .with_id(field_id);
1417
1418            // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or
1419            // REPEATED for root node.
1420            //
1421            // We only set repetition for group types that are not top-level message
1422            // type. According to parquet-format:
1423            //   Root of the schema does not have a repetition_type.
1424            //   All other types must have one.
1425            if !is_root_node {
1426                let Some(rep) = repetition else {
1427                    return Err(general_err!(
1428                        "Repetition level must be defined for non-root types"
1429                    ));
1430                };
1431                builder = builder.with_repetition(rep);
1432            }
1433            Ok((next_index, Arc::new(builder.build()?)))
1434        }
1435    }
1436}
1437
1438#[cfg(test)]
1439mod tests {
1440    use super::*;
1441
1442    use crate::{
1443        file::metadata::thrift::tests::{buf_to_schema_list, roundtrip_schema, schema_to_buf},
1444        schema::parser::parse_message_type,
1445    };
1446
1447    // TODO: add tests for v2 types
1448
1449    #[test]
1450    fn test_primitive_type() {
1451        let mut result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1452            .with_logical_type(Some(LogicalType::integer(32, true)))
1453            .with_id(Some(0))
1454            .build();
1455        assert!(result.is_ok());
1456
1457        if let Ok(tp) = result {
1458            assert!(tp.is_primitive());
1459            assert!(!tp.is_group());
1460            let basic_info = tp.get_basic_info();
1461            assert_eq!(basic_info.repetition(), Repetition::OPTIONAL);
1462            assert_eq!(
1463                basic_info.logical_type_ref(),
1464                Some(&LogicalType::integer(32, true))
1465            );
1466            assert_eq!(basic_info.converted_type(), ConvertedType::INT_32);
1467            assert_eq!(basic_info.id(), 0);
1468            match tp {
1469                Type::PrimitiveType { physical_type, .. } => {
1470                    assert_eq!(physical_type, PhysicalType::INT32);
1471                }
1472                _ => panic!(),
1473            }
1474        }
1475
1476        // Test illegal inputs with logical type
1477        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1478            .with_repetition(Repetition::REPEATED)
1479            .with_logical_type(Some(LogicalType::integer(8, true)))
1480            .build();
1481        assert!(result.is_err());
1482        if let Err(e) = result {
1483            assert_eq!(
1484                format!("{e}"),
1485                "Parquet error: Cannot annotate Integer { bit_width: 8, is_signed: true } from INT64 for field 'foo'"
1486            );
1487        }
1488
1489        // Test illegal inputs with converted type
1490        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1491            .with_repetition(Repetition::REPEATED)
1492            .with_converted_type(ConvertedType::BSON)
1493            .build();
1494        assert!(result.is_err());
1495        if let Err(e) = result {
1496            assert_eq!(
1497                format!("{e}"),
1498                "Parquet error: BSON cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1499            );
1500        }
1501
1502        result = Type::primitive_type_builder("foo", PhysicalType::INT96)
1503            .with_repetition(Repetition::REQUIRED)
1504            .with_converted_type(ConvertedType::DECIMAL)
1505            .with_precision(-1)
1506            .with_scale(-1)
1507            .build();
1508        assert!(result.is_err());
1509        if let Err(e) = result {
1510            assert_eq!(
1511                format!("{e}"),
1512                "Parquet error: DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
1513            );
1514        }
1515
1516        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1517            .with_repetition(Repetition::REQUIRED)
1518            .with_logical_type(Some(LogicalType::decimal(32, 12)))
1519            .with_precision(-1)
1520            .with_scale(-1)
1521            .build();
1522        assert!(result.is_err());
1523        if let Err(e) = result {
1524            assert_eq!(
1525                format!("{e}"),
1526                "Parquet error: DECIMAL logical type scale 32 must match self.scale -1 for field 'foo'"
1527            );
1528        }
1529
1530        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1531            .with_repetition(Repetition::REQUIRED)
1532            .with_converted_type(ConvertedType::DECIMAL)
1533            .with_precision(-1)
1534            .with_scale(-1)
1535            .build();
1536        assert!(result.is_err());
1537        if let Err(e) = result {
1538            assert_eq!(
1539                format!("{e}"),
1540                "Parquet error: Invalid DECIMAL precision: -1"
1541            );
1542        }
1543
1544        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1545            .with_repetition(Repetition::REQUIRED)
1546            .with_converted_type(ConvertedType::DECIMAL)
1547            .with_precision(0)
1548            .with_scale(-1)
1549            .build();
1550        assert!(result.is_err());
1551        if let Err(e) = result {
1552            assert_eq!(
1553                format!("{e}"),
1554                "Parquet error: Invalid DECIMAL precision: 0"
1555            );
1556        }
1557
1558        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1559            .with_repetition(Repetition::REQUIRED)
1560            .with_converted_type(ConvertedType::DECIMAL)
1561            .with_precision(1)
1562            .with_scale(-1)
1563            .build();
1564        assert!(result.is_err());
1565        if let Err(e) = result {
1566            assert_eq!(format!("{e}"), "Parquet error: Invalid DECIMAL scale: -1");
1567        }
1568
1569        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1570            .with_repetition(Repetition::REQUIRED)
1571            .with_converted_type(ConvertedType::DECIMAL)
1572            .with_precision(1)
1573            .with_scale(2)
1574            .build();
1575        assert!(result.is_err());
1576        if let Err(e) = result {
1577            assert_eq!(
1578                format!("{e}"),
1579                "Parquet error: Invalid DECIMAL: scale (2) cannot be greater than precision (1)"
1580            );
1581        }
1582
1583        // It is OK if precision == scale
1584        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1585            .with_repetition(Repetition::REQUIRED)
1586            .with_converted_type(ConvertedType::DECIMAL)
1587            .with_precision(1)
1588            .with_scale(1)
1589            .build();
1590        assert!(result.is_ok());
1591
1592        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1593            .with_repetition(Repetition::REQUIRED)
1594            .with_converted_type(ConvertedType::DECIMAL)
1595            .with_precision(18)
1596            .with_scale(2)
1597            .build();
1598        assert!(result.is_err());
1599        if let Err(e) = result {
1600            assert_eq!(
1601                format!("{e}"),
1602                "Parquet error: Cannot represent INT32 as DECIMAL with precision 18"
1603            );
1604        }
1605
1606        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1607            .with_repetition(Repetition::REQUIRED)
1608            .with_converted_type(ConvertedType::DECIMAL)
1609            .with_precision(32)
1610            .with_scale(2)
1611            .build();
1612        assert!(result.is_err());
1613        if let Err(e) = result {
1614            assert_eq!(
1615                format!("{e}"),
1616                "Parquet error: Cannot represent INT64 as DECIMAL with precision 32"
1617            );
1618        }
1619
1620        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1621            .with_repetition(Repetition::REQUIRED)
1622            .with_converted_type(ConvertedType::DECIMAL)
1623            .with_length(5)
1624            .with_precision(12)
1625            .with_scale(2)
1626            .build();
1627        assert!(result.is_err());
1628        if let Err(e) = result {
1629            assert_eq!(
1630                format!("{e}"),
1631                "Parquet error: Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length 5 and precision 12. The max precision can only be 11"
1632            );
1633        }
1634
1635        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1636            .with_repetition(Repetition::REQUIRED)
1637            .with_converted_type(ConvertedType::UINT_8)
1638            .build();
1639        assert!(result.is_err());
1640        if let Err(e) = result {
1641            assert_eq!(
1642                format!("{e}"),
1643                "Parquet error: UINT_8 cannot annotate field 'foo' because it is not a INT32 field"
1644            );
1645        }
1646
1647        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1648            .with_repetition(Repetition::REQUIRED)
1649            .with_converted_type(ConvertedType::TIME_MICROS)
1650            .build();
1651        assert!(result.is_err());
1652        if let Err(e) = result {
1653            assert_eq!(
1654                format!("{e}"),
1655                "Parquet error: TIME_MICROS cannot annotate field 'foo' because it is not a INT64 field"
1656            );
1657        }
1658
1659        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1660            .with_repetition(Repetition::REQUIRED)
1661            .with_converted_type(ConvertedType::INTERVAL)
1662            .build();
1663        assert!(result.is_err());
1664        if let Err(e) = result {
1665            assert_eq!(
1666                format!("{e}"),
1667                "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1668            );
1669        }
1670
1671        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1672            .with_repetition(Repetition::REQUIRED)
1673            .with_converted_type(ConvertedType::INTERVAL)
1674            .with_length(1)
1675            .build();
1676        assert!(result.is_err());
1677        if let Err(e) = result {
1678            assert_eq!(
1679                format!("{e}"),
1680                "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1681            );
1682        }
1683
1684        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1685            .with_repetition(Repetition::REQUIRED)
1686            .with_converted_type(ConvertedType::ENUM)
1687            .build();
1688        assert!(result.is_err());
1689        if let Err(e) = result {
1690            assert_eq!(
1691                format!("{e}"),
1692                "Parquet error: ENUM cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1693            );
1694        }
1695
1696        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1697            .with_repetition(Repetition::REQUIRED)
1698            .with_converted_type(ConvertedType::MAP)
1699            .build();
1700        assert!(result.is_err());
1701        if let Err(e) = result {
1702            assert_eq!(
1703                format!("{e}"),
1704                "Parquet error: MAP cannot be applied to primitive field 'foo'"
1705            );
1706        }
1707
1708        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1709            .with_repetition(Repetition::REQUIRED)
1710            .with_converted_type(ConvertedType::DECIMAL)
1711            .with_length(-1)
1712            .build();
1713        assert!(result.is_err());
1714        if let Err(e) = result {
1715            assert_eq!(
1716                format!("{e}"),
1717                "Parquet error: Invalid FIXED_LEN_BYTE_ARRAY length: -1 for field 'foo'"
1718            );
1719        }
1720
1721        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1722            .with_repetition(Repetition::REQUIRED)
1723            .with_logical_type(Some(LogicalType::Float16))
1724            .with_length(2)
1725            .build();
1726        assert!(result.is_ok());
1727
1728        // Can't be other than FIXED_LEN_BYTE_ARRAY for physical type
1729        result = Type::primitive_type_builder("foo", PhysicalType::FLOAT)
1730            .with_repetition(Repetition::REQUIRED)
1731            .with_logical_type(Some(LogicalType::Float16))
1732            .with_length(2)
1733            .build();
1734        assert!(result.is_err());
1735        if let Err(e) = result {
1736            assert_eq!(
1737                format!("{e}"),
1738                "Parquet error: Cannot annotate Float16 from FLOAT for field 'foo'"
1739            );
1740        }
1741
1742        // Must have length 2
1743        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1744            .with_repetition(Repetition::REQUIRED)
1745            .with_logical_type(Some(LogicalType::Float16))
1746            .with_length(4)
1747            .build();
1748        assert!(result.is_err());
1749        if let Err(e) = result {
1750            assert_eq!(
1751                format!("{e}"),
1752                "Parquet error: FLOAT16 cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(2) field"
1753            );
1754        }
1755
1756        // Must have length 16
1757        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1758            .with_repetition(Repetition::REQUIRED)
1759            .with_logical_type(Some(LogicalType::Uuid))
1760            .with_length(15)
1761            .build();
1762        assert!(result.is_err());
1763        if let Err(e) = result {
1764            assert_eq!(
1765                format!("{e}"),
1766                "Parquet error: UUID cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(16) field"
1767            );
1768        }
1769
1770        // test unknown logical types are ok
1771        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1772            .with_logical_type(Some(LogicalType::_Unknown { field_id: 100 }))
1773            .build();
1774        assert!(result.is_ok());
1775    }
1776
1777    #[test]
1778    fn test_group_type() {
1779        let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32)
1780            .with_converted_type(ConvertedType::INT_32)
1781            .with_id(Some(0))
1782            .build();
1783        assert!(f1.is_ok());
1784        let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY)
1785            .with_converted_type(ConvertedType::UTF8)
1786            .with_id(Some(1))
1787            .build();
1788        assert!(f2.is_ok());
1789
1790        let fields = vec![Arc::new(f1.unwrap()), Arc::new(f2.unwrap())];
1791
1792        let result = Type::group_type_builder("foo")
1793            .with_repetition(Repetition::REPEATED)
1794            .with_logical_type(Some(LogicalType::List))
1795            .with_fields(fields)
1796            .with_id(Some(1))
1797            .build();
1798        assert!(result.is_ok());
1799
1800        let tp = result.unwrap();
1801        let basic_info = tp.get_basic_info();
1802        assert!(tp.is_group());
1803        assert!(!tp.is_primitive());
1804        assert_eq!(basic_info.repetition(), Repetition::REPEATED);
1805        assert_eq!(basic_info.logical_type_ref(), Some(&LogicalType::List));
1806        assert_eq!(basic_info.converted_type(), ConvertedType::LIST);
1807        assert_eq!(basic_info.id(), 1);
1808        assert_eq!(tp.get_fields().len(), 2);
1809        assert_eq!(tp.get_fields()[0].name(), "f1");
1810        assert_eq!(tp.get_fields()[1].name(), "f2");
1811    }
1812
1813    #[test]
1814    fn test_column_descriptor() {
1815        let result = test_column_descriptor_helper();
1816        assert!(
1817            result.is_ok(),
1818            "Expected result to be OK but got err:\n {}",
1819            result.unwrap_err()
1820        );
1821    }
1822
1823    fn test_column_descriptor_helper() -> Result<()> {
1824        let tp = Type::primitive_type_builder("name", PhysicalType::BYTE_ARRAY)
1825            .with_converted_type(ConvertedType::UTF8)
1826            .build()?;
1827
1828        let descr = ColumnDescriptor::new(Arc::new(tp), 4, 1, ColumnPath::from("name"));
1829
1830        assert_eq!(descr.path(), &ColumnPath::from("name"));
1831        assert_eq!(descr.converted_type(), ConvertedType::UTF8);
1832        assert_eq!(descr.physical_type(), PhysicalType::BYTE_ARRAY);
1833        assert_eq!(descr.max_def_level(), 4);
1834        assert_eq!(descr.max_rep_level(), 1);
1835        assert_eq!(descr.name(), "name");
1836        assert_eq!(descr.type_length(), -1);
1837        assert_eq!(descr.type_precision(), -1);
1838        assert_eq!(descr.type_scale(), -1);
1839
1840        Ok(())
1841    }
1842
1843    #[test]
1844    fn test_schema_descriptor() {
1845        let result = test_schema_descriptor_helper();
1846        assert!(
1847            result.is_ok(),
1848            "Expected result to be OK but got err:\n {}",
1849            result.unwrap_err()
1850        );
1851    }
1852
1853    // A helper fn to avoid handling the results from type creation
1854    fn test_schema_descriptor_helper() -> Result<()> {
1855        let mut fields = vec![];
1856
1857        let inta = Type::primitive_type_builder("a", PhysicalType::INT32)
1858            .with_repetition(Repetition::REQUIRED)
1859            .with_converted_type(ConvertedType::INT_32)
1860            .build()?;
1861        fields.push(Arc::new(inta));
1862        let intb = Type::primitive_type_builder("b", PhysicalType::INT64)
1863            .with_converted_type(ConvertedType::INT_64)
1864            .build()?;
1865        fields.push(Arc::new(intb));
1866        let intc = Type::primitive_type_builder("c", PhysicalType::BYTE_ARRAY)
1867            .with_repetition(Repetition::REPEATED)
1868            .with_converted_type(ConvertedType::UTF8)
1869            .build()?;
1870        fields.push(Arc::new(intc));
1871
1872        // 3-level list encoding
1873        let item1 = Type::primitive_type_builder("item1", PhysicalType::INT64)
1874            .with_repetition(Repetition::REQUIRED)
1875            .with_converted_type(ConvertedType::INT_64)
1876            .build()?;
1877        let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?;
1878        let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32)
1879            .with_repetition(Repetition::REPEATED)
1880            .with_converted_type(ConvertedType::INT_32)
1881            .build()?;
1882        let list = Type::group_type_builder("records")
1883            .with_repetition(Repetition::REPEATED)
1884            .with_converted_type(ConvertedType::LIST)
1885            .with_fields(vec![Arc::new(item1), Arc::new(item2), Arc::new(item3)])
1886            .build()?;
1887        let bag = Type::group_type_builder("bag")
1888            .with_repetition(Repetition::OPTIONAL)
1889            .with_fields(vec![Arc::new(list)])
1890            .build()?;
1891        fields.push(Arc::new(bag));
1892
1893        let schema = Type::group_type_builder("schema")
1894            .with_repetition(Repetition::REPEATED)
1895            .with_fields(fields)
1896            .build()?;
1897        let descr = SchemaDescriptor::new(Arc::new(schema));
1898
1899        let nleaves = 6;
1900        assert_eq!(descr.num_columns(), nleaves);
1901
1902        //                             mdef mrep
1903        // required int32 a            0    0
1904        // optional int64 b            1    0
1905        // repeated byte_array c       1    1
1906        // optional group bag          1    0
1907        //   repeated group records    2    1
1908        //     required int64 item1    2    1
1909        //     optional boolean item2  3    1
1910        //     repeated int32 item3    3    2
1911        let ex_max_def_levels = [0, 1, 1, 2, 3, 3];
1912        let ex_max_rep_levels = [0, 0, 1, 1, 1, 2];
1913
1914        for i in 0..nleaves {
1915            let col = descr.column(i);
1916            assert_eq!(col.max_def_level(), ex_max_def_levels[i], "{i}");
1917            assert_eq!(col.max_rep_level(), ex_max_rep_levels[i], "{i}");
1918        }
1919
1920        assert_eq!(descr.column(0).path().string(), "a");
1921        assert_eq!(descr.column(1).path().string(), "b");
1922        assert_eq!(descr.column(2).path().string(), "c");
1923        assert_eq!(descr.column(3).path().string(), "bag.records.item1");
1924        assert_eq!(descr.column(4).path().string(), "bag.records.item2");
1925        assert_eq!(descr.column(5).path().string(), "bag.records.item3");
1926
1927        assert_eq!(descr.get_column_root(0).name(), "a");
1928        assert_eq!(descr.get_column_root(3).name(), "bag");
1929        assert_eq!(descr.get_column_root(4).name(), "bag");
1930        assert_eq!(descr.get_column_root(5).name(), "bag");
1931
1932        Ok(())
1933    }
1934
1935    #[test]
1936    fn test_schema_build_tree_def_rep_levels() {
1937        let message_type = "
1938    message spark_schema {
1939      REQUIRED INT32 a;
1940      OPTIONAL group b {
1941        OPTIONAL INT32 _1;
1942        OPTIONAL INT32 _2;
1943      }
1944      OPTIONAL group c (LIST) {
1945        REPEATED group list {
1946          OPTIONAL INT32 element;
1947        }
1948      }
1949    }
1950    ";
1951        let schema = parse_message_type(message_type).expect("should parse schema");
1952        let descr = SchemaDescriptor::new(Arc::new(schema));
1953        // required int32 a
1954        assert_eq!(descr.column(0).max_def_level(), 0);
1955        assert_eq!(descr.column(0).max_rep_level(), 0);
1956        // optional int32 b._1
1957        assert_eq!(descr.column(1).max_def_level(), 2);
1958        assert_eq!(descr.column(1).max_rep_level(), 0);
1959        // optional int32 b._2
1960        assert_eq!(descr.column(2).max_def_level(), 2);
1961        assert_eq!(descr.column(2).max_rep_level(), 0);
1962        // repeated optional int32 c.list.element
1963        assert_eq!(descr.column(3).max_def_level(), 3);
1964        assert_eq!(descr.column(3).max_rep_level(), 1);
1965    }
1966
1967    #[test]
1968    fn test_schema_build_tree_repeated_ancestor_def_level() {
1969        // Flat columns: no REPEATED ancestor → repeated_ancestor_def_level = 0
1970        let message_type = "
1971    message m {
1972      REQUIRED INT32 a;
1973      OPTIONAL INT32 b;
1974      OPTIONAL group s {
1975        OPTIONAL INT32 x;
1976      }
1977    }
1978    ";
1979        let schema = parse_message_type(message_type).expect("should parse schema");
1980        let descr = SchemaDescriptor::new(Arc::new(schema));
1981        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 0); // a
1982        assert_eq!(descr.column(1).repeated_ancestor_def_level(), 0); // b
1983        assert_eq!(descr.column(2).repeated_ancestor_def_level(), 0); // s.x
1984
1985        // Standard list: OPTIONAL outer, REPEATED group, OPTIONAL element
1986        // repeated_ancestor_def_level is the def_level at the REPEATED group (= 2)
1987        let message_type = "
1988    message m {
1989      OPTIONAL group c (LIST) {
1990        REPEATED group list {
1991          OPTIONAL INT32 element;
1992        }
1993      }
1994    }
1995    ";
1996        let schema = parse_message_type(message_type).expect("should parse schema");
1997        let descr = SchemaDescriptor::new(Arc::new(schema));
1998        // c(optional)=1, list(repeated)=2, element(optional)=3
1999        assert_eq!(descr.column(0).max_def_level(), 3);
2000        assert_eq!(descr.column(0).max_rep_level(), 1);
2001        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 2);
2002
2003        // Required list: REQUIRED outer, REPEATED group, REQUIRED element
2004        // No OPTIONAL nodes between REPEATED and leaf, so repeated_ancestor_def_level == max_def_level
2005        let message_type = "
2006    message m {
2007      REQUIRED group c (LIST) {
2008        REPEATED group list {
2009          REQUIRED INT32 element;
2010        }
2011      }
2012    }
2013    ";
2014        let schema = parse_message_type(message_type).expect("should parse schema");
2015        let descr = SchemaDescriptor::new(Arc::new(schema));
2016        // list(repeated)=1, element(required)=1
2017        assert_eq!(descr.column(0).max_def_level(), 1);
2018        assert_eq!(descr.column(0).max_rep_level(), 1);
2019        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 1);
2020
2021        // Nested lists: innermost REPEATED wins
2022        let message_type = "
2023    message m {
2024      OPTIONAL group outer (LIST) {
2025        REPEATED group list {
2026          OPTIONAL group inner (LIST) {
2027            REPEATED group list2 {
2028              OPTIONAL INT32 element;
2029            }
2030          }
2031        }
2032      }
2033    }
2034    ";
2035        let schema = parse_message_type(message_type).expect("should parse schema");
2036        let descr = SchemaDescriptor::new(Arc::new(schema));
2037        // outer(opt)=1, list(rep)=2, inner(opt)=3, list2(rep)=4, element(opt)=5
2038        assert_eq!(descr.column(0).max_def_level(), 5);
2039        assert_eq!(descr.column(0).max_rep_level(), 2);
2040        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 4);
2041
2042        // Struct inside list: all sibling leaves share the same repeated_ancestor_def_level
2043        let message_type = "
2044    message m {
2045      OPTIONAL group bag (LIST) {
2046        REPEATED group list {
2047          REQUIRED group item {
2048            OPTIONAL INT32 x;
2049            REQUIRED INT32 y;
2050          }
2051        }
2052      }
2053    }
2054    ";
2055        let schema = parse_message_type(message_type).expect("should parse schema");
2056        let descr = SchemaDescriptor::new(Arc::new(schema));
2057        // bag(opt)=1, list(rep)=2, item(req)=2, x(opt)=3
2058        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 2); // bag.list.item.x
2059        // bag(opt)=1, list(rep)=2, item(req)=2, y(req)=2
2060        assert_eq!(descr.column(1).repeated_ancestor_def_level(), 2); // bag.list.item.y
2061
2062        // Map type: key (required) and value (optional) under the same REPEATED group
2063        let message_type = "
2064    message m {
2065      OPTIONAL group my_map (MAP) {
2066        REPEATED group key_value {
2067          REQUIRED BYTE_ARRAY key (UTF8);
2068          OPTIONAL INT32 value;
2069        }
2070      }
2071    }
2072    ";
2073        let schema = parse_message_type(message_type).expect("should parse schema");
2074        let descr = SchemaDescriptor::new(Arc::new(schema));
2075        // my_map(opt)=1, key_value(rep)=2, key(req)=2
2076        assert_eq!(descr.column(0).max_def_level(), 2);
2077        assert_eq!(descr.column(0).repeated_ancestor_def_level(), 2); // key: max_def == repeated_ancestor
2078        // my_map(opt)=1, key_value(rep)=2, value(opt)=3
2079        assert_eq!(descr.column(1).max_def_level(), 3);
2080        assert_eq!(descr.column(1).repeated_ancestor_def_level(), 2); // value: max_def > repeated_ancestor
2081    }
2082
2083    #[test]
2084    #[should_panic(expected = "Cannot call get_physical_type() on a non-primitive type")]
2085    fn test_get_physical_type_panic() {
2086        let list = Type::group_type_builder("records")
2087            .with_repetition(Repetition::REPEATED)
2088            .build()
2089            .unwrap();
2090        list.get_physical_type();
2091    }
2092
2093    #[test]
2094    fn test_get_physical_type_primitive() {
2095        let f = Type::primitive_type_builder("f", PhysicalType::INT64)
2096            .build()
2097            .unwrap();
2098        assert_eq!(f.get_physical_type(), PhysicalType::INT64);
2099
2100        let f = Type::primitive_type_builder("f", PhysicalType::BYTE_ARRAY)
2101            .build()
2102            .unwrap();
2103        assert_eq!(f.get_physical_type(), PhysicalType::BYTE_ARRAY);
2104    }
2105
2106    #[test]
2107    fn test_check_contains_primitive_primitive() {
2108        // OK
2109        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2110            .build()
2111            .unwrap();
2112        let f2 = Type::primitive_type_builder("f", PhysicalType::INT32)
2113            .build()
2114            .unwrap();
2115        assert!(f1.check_contains(&f2));
2116
2117        // OK: different logical type does not affect check_contains
2118        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2119            .with_converted_type(ConvertedType::UINT_8)
2120            .build()
2121            .unwrap();
2122        let f2 = Type::primitive_type_builder("f", PhysicalType::INT32)
2123            .with_converted_type(ConvertedType::UINT_16)
2124            .build()
2125            .unwrap();
2126        assert!(f1.check_contains(&f2));
2127
2128        // KO: different name
2129        let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32)
2130            .build()
2131            .unwrap();
2132        let f2 = Type::primitive_type_builder("f2", PhysicalType::INT32)
2133            .build()
2134            .unwrap();
2135        assert!(!f1.check_contains(&f2));
2136
2137        // KO: different type
2138        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2139            .build()
2140            .unwrap();
2141        let f2 = Type::primitive_type_builder("f", PhysicalType::INT64)
2142            .build()
2143            .unwrap();
2144        assert!(!f1.check_contains(&f2));
2145
2146        // KO: different repetition
2147        let f1 = Type::primitive_type_builder("f", PhysicalType::INT32)
2148            .with_repetition(Repetition::REQUIRED)
2149            .build()
2150            .unwrap();
2151        let f2 = Type::primitive_type_builder("f", PhysicalType::INT32)
2152            .with_repetition(Repetition::OPTIONAL)
2153            .build()
2154            .unwrap();
2155        assert!(!f1.check_contains(&f2));
2156    }
2157
2158    // function to create a new group type for testing
2159    fn test_new_group_type(name: &str, repetition: Repetition, types: Vec<Type>) -> Type {
2160        Type::group_type_builder(name)
2161            .with_repetition(repetition)
2162            .with_fields(types.into_iter().map(Arc::new).collect())
2163            .build()
2164            .unwrap()
2165    }
2166
2167    #[test]
2168    fn test_check_contains_group_group() {
2169        // OK: should match okay with empty fields
2170        let f1 = Type::group_type_builder("f").build().unwrap();
2171        let f2 = Type::group_type_builder("f").build().unwrap();
2172        assert!(f1.check_contains(&f2));
2173        assert!(!f1.is_optional());
2174
2175        // OK: fields match
2176        let f1 = test_new_group_type(
2177            "f",
2178            Repetition::REPEATED,
2179            vec![
2180                Type::primitive_type_builder("f1", PhysicalType::INT32)
2181                    .build()
2182                    .unwrap(),
2183                Type::primitive_type_builder("f2", PhysicalType::INT64)
2184                    .build()
2185                    .unwrap(),
2186            ],
2187        );
2188        let f2 = test_new_group_type(
2189            "f",
2190            Repetition::REPEATED,
2191            vec![
2192                Type::primitive_type_builder("f1", PhysicalType::INT32)
2193                    .build()
2194                    .unwrap(),
2195                Type::primitive_type_builder("f2", PhysicalType::INT64)
2196                    .build()
2197                    .unwrap(),
2198            ],
2199        );
2200        assert!(f1.check_contains(&f2));
2201
2202        // OK: subset of fields
2203        let f1 = test_new_group_type(
2204            "f",
2205            Repetition::REPEATED,
2206            vec![
2207                Type::primitive_type_builder("f1", PhysicalType::INT32)
2208                    .build()
2209                    .unwrap(),
2210                Type::primitive_type_builder("f2", PhysicalType::INT64)
2211                    .build()
2212                    .unwrap(),
2213            ],
2214        );
2215        let f2 = test_new_group_type(
2216            "f",
2217            Repetition::REPEATED,
2218            vec![
2219                Type::primitive_type_builder("f2", PhysicalType::INT64)
2220                    .build()
2221                    .unwrap(),
2222            ],
2223        );
2224        assert!(f1.check_contains(&f2));
2225
2226        // KO: different name
2227        let f1 = Type::group_type_builder("f1").build().unwrap();
2228        let f2 = Type::group_type_builder("f2").build().unwrap();
2229        assert!(!f1.check_contains(&f2));
2230
2231        // KO: different repetition
2232        let f1 = Type::group_type_builder("f")
2233            .with_repetition(Repetition::OPTIONAL)
2234            .build()
2235            .unwrap();
2236        let f2 = Type::group_type_builder("f")
2237            .with_repetition(Repetition::REPEATED)
2238            .build()
2239            .unwrap();
2240        assert!(!f1.check_contains(&f2));
2241
2242        // KO: different fields
2243        let f1 = test_new_group_type(
2244            "f",
2245            Repetition::REPEATED,
2246            vec![
2247                Type::primitive_type_builder("f1", PhysicalType::INT32)
2248                    .build()
2249                    .unwrap(),
2250                Type::primitive_type_builder("f2", PhysicalType::INT64)
2251                    .build()
2252                    .unwrap(),
2253            ],
2254        );
2255        let f2 = test_new_group_type(
2256            "f",
2257            Repetition::REPEATED,
2258            vec![
2259                Type::primitive_type_builder("f1", PhysicalType::INT32)
2260                    .build()
2261                    .unwrap(),
2262                Type::primitive_type_builder("f2", PhysicalType::BOOLEAN)
2263                    .build()
2264                    .unwrap(),
2265            ],
2266        );
2267        assert!(!f1.check_contains(&f2));
2268
2269        // KO: different fields
2270        let f1 = test_new_group_type(
2271            "f",
2272            Repetition::REPEATED,
2273            vec![
2274                Type::primitive_type_builder("f1", PhysicalType::INT32)
2275                    .build()
2276                    .unwrap(),
2277                Type::primitive_type_builder("f2", PhysicalType::INT64)
2278                    .build()
2279                    .unwrap(),
2280            ],
2281        );
2282        let f2 = test_new_group_type(
2283            "f",
2284            Repetition::REPEATED,
2285            vec![
2286                Type::primitive_type_builder("f3", PhysicalType::INT32)
2287                    .build()
2288                    .unwrap(),
2289            ],
2290        );
2291        assert!(!f1.check_contains(&f2));
2292    }
2293
2294    #[test]
2295    fn test_check_contains_group_primitive() {
2296        // KO: should not match
2297        let f1 = Type::group_type_builder("f").build().unwrap();
2298        let f2 = Type::primitive_type_builder("f", PhysicalType::INT64)
2299            .build()
2300            .unwrap();
2301        assert!(!f1.check_contains(&f2));
2302        assert!(!f2.check_contains(&f1));
2303
2304        // KO: should not match when primitive field is part of group type
2305        let f1 = test_new_group_type(
2306            "f",
2307            Repetition::REPEATED,
2308            vec![
2309                Type::primitive_type_builder("f1", PhysicalType::INT32)
2310                    .build()
2311                    .unwrap(),
2312            ],
2313        );
2314        let f2 = Type::primitive_type_builder("f1", PhysicalType::INT32)
2315            .build()
2316            .unwrap();
2317        assert!(!f1.check_contains(&f2));
2318        assert!(!f2.check_contains(&f1));
2319
2320        // OK: match nested types
2321        let f1 = test_new_group_type(
2322            "a",
2323            Repetition::REPEATED,
2324            vec![
2325                test_new_group_type(
2326                    "b",
2327                    Repetition::REPEATED,
2328                    vec![
2329                        Type::primitive_type_builder("c", PhysicalType::INT32)
2330                            .build()
2331                            .unwrap(),
2332                    ],
2333                ),
2334                Type::primitive_type_builder("d", PhysicalType::INT64)
2335                    .build()
2336                    .unwrap(),
2337                Type::primitive_type_builder("e", PhysicalType::BOOLEAN)
2338                    .build()
2339                    .unwrap(),
2340            ],
2341        );
2342        let f2 = test_new_group_type(
2343            "a",
2344            Repetition::REPEATED,
2345            vec![test_new_group_type(
2346                "b",
2347                Repetition::REPEATED,
2348                vec![
2349                    Type::primitive_type_builder("c", PhysicalType::INT32)
2350                        .build()
2351                        .unwrap(),
2352                ],
2353            )],
2354        );
2355        assert!(f1.check_contains(&f2)); // should match
2356        assert!(!f2.check_contains(&f1)); // should fail
2357    }
2358
2359    #[test]
2360    fn test_schema_type_thrift_conversion_err() {
2361        let schema = Type::primitive_type_builder("col", PhysicalType::INT32)
2362            .build()
2363            .unwrap();
2364        let schema = Arc::new(schema);
2365        let thrift_schema = schema_to_buf(&schema);
2366        assert!(thrift_schema.is_err());
2367        if let Err(e) = thrift_schema {
2368            assert_eq!(
2369                format!("{e}"),
2370                "Parquet error: Root schema must be Group type"
2371            );
2372        }
2373    }
2374
2375    #[test]
2376    fn test_schema_type_thrift_conversion() {
2377        let message_type = "
2378    message conversions {
2379      REQUIRED INT64 id;
2380      OPTIONAL FIXED_LEN_BYTE_ARRAY (2) f16 (FLOAT16);
2381      OPTIONAL group int_array_Array (LIST) {
2382        REPEATED group list {
2383          OPTIONAL group element (LIST) {
2384            REPEATED group list {
2385              OPTIONAL INT32 element;
2386            }
2387          }
2388        }
2389      }
2390      OPTIONAL group int_map (MAP) {
2391        REPEATED group map (MAP_KEY_VALUE) {
2392          REQUIRED BYTE_ARRAY key (UTF8);
2393          OPTIONAL INT32 value;
2394        }
2395      }
2396      OPTIONAL group int_Map_Array (LIST) {
2397        REPEATED group list {
2398          OPTIONAL group g (MAP) {
2399            REPEATED group map (MAP_KEY_VALUE) {
2400              REQUIRED BYTE_ARRAY key (UTF8);
2401              OPTIONAL group value {
2402                OPTIONAL group H {
2403                  OPTIONAL group i (LIST) {
2404                    REPEATED group list {
2405                      OPTIONAL DOUBLE element;
2406                    }
2407                  }
2408                }
2409              }
2410            }
2411          }
2412        }
2413      }
2414      OPTIONAL group nested_struct {
2415        OPTIONAL INT32 A;
2416        OPTIONAL group b (LIST) {
2417          REPEATED group list {
2418            REQUIRED FIXED_LEN_BYTE_ARRAY (16) element;
2419          }
2420        }
2421      }
2422    }
2423    ";
2424        let expected_schema = parse_message_type(message_type).unwrap();
2425        let result_schema = roundtrip_schema(Arc::new(expected_schema.clone())).unwrap();
2426        assert_eq!(result_schema, Arc::new(expected_schema));
2427    }
2428
2429    #[test]
2430    fn test_schema_type_thrift_conversion_decimal() {
2431        let message_type = "
2432    message decimals {
2433      OPTIONAL INT32 field0;
2434      OPTIONAL INT64 field1 (DECIMAL (18, 2));
2435      OPTIONAL FIXED_LEN_BYTE_ARRAY (16) field2 (DECIMAL (38, 18));
2436      OPTIONAL BYTE_ARRAY field3 (DECIMAL (9));
2437    }
2438    ";
2439        let expected_schema = parse_message_type(message_type).unwrap();
2440        let result_schema = roundtrip_schema(Arc::new(expected_schema.clone())).unwrap();
2441        assert_eq!(result_schema, Arc::new(expected_schema));
2442    }
2443
2444    // Tests schema conversion from thrift, when num_children is set to Some(0) for a
2445    // primitive type.
2446    #[test]
2447    fn test_schema_from_thrift_with_num_children_set() {
2448        // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT
2449        let message_type = "
2450    message schema {
2451      OPTIONAL BYTE_ARRAY id (UTF8);
2452      OPTIONAL BYTE_ARRAY name (UTF8);
2453      OPTIONAL BYTE_ARRAY message (UTF8);
2454      OPTIONAL INT32 type (UINT_8);
2455      OPTIONAL INT64 author_time (TIMESTAMP_MILLIS);
2456      OPTIONAL INT64 __index_level_0__;
2457    }
2458    ";
2459
2460        let expected_schema = Arc::new(parse_message_type(message_type).unwrap());
2461        let mut buf = schema_to_buf(&expected_schema).unwrap();
2462        let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap();
2463
2464        // Change all of None to Some(0)
2465        for elem in &mut thrift_schema[..] {
2466            if elem.num_children.is_none() {
2467                elem.num_children = Some(0);
2468            }
2469        }
2470
2471        let result_schema = parquet_schema_from_array(thrift_schema).unwrap();
2472        assert_eq!(result_schema, expected_schema);
2473    }
2474
2475    // Sometimes parquet-cpp sets repetition level for the root node, which is against
2476    // the format definition, but we need to handle it by setting it back to None.
2477    #[test]
2478    fn test_schema_from_thrift_root_has_repetition() {
2479        // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT
2480        let message_type = "
2481    message schema {
2482      OPTIONAL BYTE_ARRAY a (UTF8);
2483      OPTIONAL INT32 b (UINT_8);
2484    }
2485    ";
2486
2487        let expected_schema = Arc::new(parse_message_type(message_type).unwrap());
2488        let mut buf = schema_to_buf(&expected_schema).unwrap();
2489        let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap();
2490        thrift_schema[0].repetition_type = Some(Repetition::REQUIRED);
2491
2492        let result_schema = parquet_schema_from_array(thrift_schema).unwrap();
2493        assert_eq!(result_schema, expected_schema);
2494    }
2495
2496    #[test]
2497    fn test_schema_from_thrift_group_has_no_child() {
2498        let message_type = "message schema {}";
2499
2500        let expected_schema = Arc::new(parse_message_type(message_type).unwrap());
2501        let mut buf = schema_to_buf(&expected_schema).unwrap();
2502        let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap();
2503        thrift_schema[0].repetition_type = Some(Repetition::REQUIRED);
2504
2505        let result_schema = parquet_schema_from_array(thrift_schema).unwrap();
2506        assert_eq!(result_schema, expected_schema);
2507    }
2508
2509    #[test]
2510    fn test_parquet_schema_from_array_rejects_negative_num_children() {
2511        let elements = vec![SchemaElement {
2512            r#type: None,
2513            type_length: None,
2514            repetition_type: Some(Repetition::REQUIRED),
2515            name: "schema",
2516            num_children: Some(-1),
2517            converted_type: None,
2518            scale: None,
2519            precision: None,
2520            field_id: None,
2521            logical_type: None,
2522        }];
2523        let result = parquet_schema_from_array(elements);
2524        assert!(result.unwrap_err().to_string().contains("Integer overflow"));
2525    }
2526}