parquet/
basic.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains Rust mappings for Thrift definition. This module contains only mappings for thrift
19//! enums and unions. Thrift structs are handled elsewhere.
20//! Refer to [`parquet.thrift`](https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift)
21//! file to see raw definitions.
22
23use std::io::Write;
24use std::str::FromStr;
25use std::{fmt, str};
26
27pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel};
28use crate::file::metadata::HeapSize;
29use crate::parquet_thrift::{
30    ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
31    WriteThrift, WriteThriftField,
32};
33use crate::{thrift_enum, thrift_struct, thrift_union_all_empty, write_thrift_field};
34
35use crate::errors::{ParquetError, Result};
36
37// ----------------------------------------------------------------------
38// Types from the Thrift definition
39
40// ----------------------------------------------------------------------
41// Mirrors thrift enum `Type`
42
thrift_enum!(
/// Types supported by Parquet.
///
/// These physical types are intended to be used in combination with the encodings to
/// control the on-disk storage format.
/// For example INT16 is not included as a type since a good encoding of INT32
/// would handle this.
enum Type {
  // The integer on each line is the discriminant assigned to the variant in this
  // declaration.
  BOOLEAN = 0;
  INT32 = 1;
  INT64 = 2;
  INT96 = 3;  // deprecated, only used by legacy implementations.
  FLOAT = 4;
  DOUBLE = 5;
  BYTE_ARRAY = 6;
  FIXED_LEN_BYTE_ARRAY = 7;
}
);
61
62// ----------------------------------------------------------------------
63// Mirrors thrift enum `ConvertedType`
64
65// TODO(ets): Adding the `NONE` variant to this enum is a bit awkward. We should
66// look into removing it and using `Option<ConvertedType>` instead.
thrift_enum!(
/// Common types (converted types) used by frameworks when using Parquet.
///
/// This helps map between types in those frameworks to the base types in Parquet.
/// This is only metadata and not needed to read or write the data.
///
/// This enum was renamed from `LogicalType` in version 4.0.0.
/// If targeting Parquet format 2.4.0 or above, please use [LogicalType] instead.
enum ConvertedType {
  /// Not defined in the spec, used internally to indicate no type conversion
  NONE = -1;

  /// A BYTE_ARRAY actually contains UTF8 encoded chars.
  UTF8 = 0;

  /// A map is converted as an optional field containing a repeated key/value pair.
  MAP = 1;

  /// A key/value pair is converted into a group of two fields.
  MAP_KEY_VALUE = 2;

  /// A list is converted into an optional field containing a repeated field for its
  /// values.
  LIST = 3;

  /// An enum is converted into a BYTE_ARRAY field
  ENUM = 4;

  /// A decimal value.
  ///
  /// This may be used to annotate BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY primitive
  /// types. The underlying byte array stores the unscaled value encoded as two's
  /// complement using big-endian byte order (the most significant byte is the
  /// zeroth element). The value of the decimal is the value * 10^{-scale}.
  ///
  /// This must be accompanied by a (maximum) precision and a scale in the
  /// SchemaElement. The precision specifies the number of digits in the decimal
  /// and the scale stores the location of the decimal point. For example 1.23
  /// would have precision 3 (3 total digits) and scale 2 (the decimal point is
  /// 2 digits over).
  DECIMAL = 5;

  /// A date stored as days since Unix epoch, encoded as the INT32 physical type.
  DATE = 6;

  /// The total number of milliseconds since midnight. The value is stored as an INT32
  /// physical type.
  TIME_MILLIS = 7;

  /// The total number of microseconds since midnight. The value is stored as an INT64
  /// physical type.
  TIME_MICROS = 8;

  /// Date and time recorded as milliseconds since the Unix epoch.
  /// Recorded as a physical type of INT64.
  TIMESTAMP_MILLIS = 9;

  /// Date and time recorded as microseconds since the Unix epoch.
  /// The value is stored as an INT64 physical type.
  TIMESTAMP_MICROS = 10;

  /// An unsigned 8 bit integer value stored as INT32 physical type.
  UINT_8 = 11;

  /// An unsigned 16 bit integer value stored as INT32 physical type.
  UINT_16 = 12;

  /// An unsigned 32 bit integer value stored as INT32 physical type.
  UINT_32 = 13;

  /// An unsigned 64 bit integer value stored as INT64 physical type.
  UINT_64 = 14;

  /// A signed 8 bit integer value stored as INT32 physical type.
  INT_8 = 15;

  /// A signed 16 bit integer value stored as INT32 physical type.
  INT_16 = 16;

  /// A signed 32 bit integer value stored as INT32 physical type.
  INT_32 = 17;

  /// A signed 64 bit integer value stored as INT64 physical type.
  INT_64 = 18;

  /// A JSON document embedded within a single UTF8 column.
  JSON = 19;

  /// A BSON document embedded within a single BINARY column.
  BSON = 20;

  /// An interval of time
  ///
  /// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12.
  /// This data is composed of three separate little endian unsigned integers.
  /// Each stores a component of a duration of time. The first integer identifies
  /// the number of months associated with the duration, the second identifies
  /// the number of days associated with the duration and the third identifies
  /// the number of milliseconds associated with the provided duration.
  /// This duration of time is independent of any particular timezone or date.
  INTERVAL = 21;
}
);
170
171// ----------------------------------------------------------------------
172// Mirrors thrift union `TimeUnit`
173
thrift_union_all_empty!(
/// Time unit for `Time` and `Timestamp` logical types.
// NOTE(review): each member line presumably reads
// `<thrift field id>: <thrift member type> <generated variant name>` — confirm
// against the `thrift_union_all_empty!` macro definition.
union TimeUnit {
  1: MilliSeconds MILLIS
  2: MicroSeconds MICROS
  3: NanoSeconds NANOS
}
);
182
183// ----------------------------------------------------------------------
184// Mirrors thrift union `LogicalType`
185
186// private structs for decoding logical type
187
// Payload of member 5 of the `LogicalType` union; copied field-for-field into
// `LogicalType::Decimal` when reading and rebuilt from it when writing.
thrift_struct!(
struct DecimalType {
  1: required i32 scale
  2: required i32 precision
}
);
194
// Payload of member 8 (`Timestamp`) of the `LogicalType` union. Member 7 (`Time`)
// uses the same layout through the `TimeType` alias declared below this struct.
thrift_struct!(
struct TimestampType {
  1: required bool is_adjusted_to_u_t_c
  2: required TimeUnit unit
}
);
201
202// they are identical
203use TimestampType as TimeType;
204
// Payload of member 10 of the `LogicalType` union; maps onto `LogicalType::Integer`.
thrift_struct!(
struct IntType {
  1: required i8 bit_width
  2: required bool is_signed
}
);
211
// Payload of member 16 of the `LogicalType` union; maps onto `LogicalType::Variant`.
thrift_struct!(
struct VariantType {
  // The version of the variant specification that the variant was
  // written with. Optional: it surfaces as `Option<i8>` on
  // `LogicalType::Variant`.
  1: optional i8 specification_version
}
);
219
// Payload of member 17 of the `LogicalType` union. The CRS string carries the
// lifetime `'a`; `LogicalType::Geometry` stores an owned copy of it instead.
thrift_struct!(
struct GeometryType<'a> {
  1: optional string<'a> crs;
}
);
225
// Payload of member 18 of the `LogicalType` union. Both fields are optional on
// the wire; the `LogicalType` reader substitutes `SPHERICAL` for a missing
// algorithm.
thrift_struct!(
struct GeographyType<'a> {
  1: optional string<'a> crs;
  2: optional EdgeInterpolationAlgorithm algorithm;
}
);
232
233// TODO(ets): should we switch to tuple variants so we can use
234// the thrift macros?
235
/// Logical types used by version 2.4.0+ of the Parquet format.
///
/// This is an *entirely new* enum as of version
/// 4.0.0. The enum previously named `LogicalType` was renamed to
/// [`ConvertedType`]. Please see the README.md for more details.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LogicalType {
    /// A UTF8 encoded string.
    String,
    /// A map of key-value pairs.
    Map,
    /// A list of elements.
    List,
    /// A set of predefined values.
    Enum,
    /// A decimal value with a specified scale and precision.
    Decimal {
        /// The location of the decimal point, i.e. the number of digits to its right.
        scale: i32,
        /// The total number of digits in the decimal.
        precision: i32,
    },
    /// A date stored as days since Unix epoch.
    Date,
    /// A time stored as [`TimeUnit`] since midnight.
    Time {
        /// Whether the time is adjusted to UTC.
        is_adjusted_to_u_t_c: bool,
        /// The unit of time.
        unit: TimeUnit,
    },
    /// A timestamp stored as [`TimeUnit`] since Unix epoch.
    Timestamp {
        /// Whether the timestamp is adjusted to UTC.
        is_adjusted_to_u_t_c: bool,
        /// The unit of time.
        unit: TimeUnit,
    },
    /// An integer with a specified bit width and signedness.
    Integer {
        /// The number of bits in the integer.
        bit_width: i8,
        /// Whether the integer is signed.
        is_signed: bool,
    },
    /// An unknown logical type.
    Unknown,
    /// A JSON document.
    Json,
    /// A BSON document.
    Bson,
    /// A UUID.
    Uuid,
    /// A 16-bit floating point number.
    Float16,
    /// A Variant value.
    Variant {
        /// The version of the variant specification that the variant was written with.
        specification_version: Option<i8>,
    },
    /// A geospatial feature in the Well-Known Binary (WKB) format with linear/planar edges interpolation.
    Geometry {
        /// A custom CRS. If unset, the CRS defaults to `OGC:CRS84`, which means that the
        /// geometries must be stored in longitude, latitude based on the WGS84 datum.
        crs: Option<String>,
    },
    /// A geospatial feature in the WKB format with an explicit (non-linear/non-planar) edges interpolation.
    Geography {
        /// A custom CRS. If unset, the CRS defaults to `OGC:CRS84`.
        crs: Option<String>,
        /// An optional algorithm can be set to correctly interpret edges interpolation
        /// of the geometries. If unset, the algorithm defaults to `SPHERICAL`.
        algorithm: Option<EdgeInterpolationAlgorithm>,
    },
    /// For forward compatibility; used when an unknown union value is encountered.
    _Unknown {
        /// The field id encountered when parsing the unknown logical type.
        field_id: i16,
    },
}
316
317impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType {
318    fn read_thrift(prot: &mut R) -> Result<Self> {
319        let field_ident = prot.read_field_begin(0)?;
320        if field_ident.field_type == FieldType::Stop {
321            return Err(general_err!("received empty union from remote LogicalType"));
322        }
323        let ret = match field_ident.id {
324            1 => {
325                prot.skip_empty_struct()?;
326                Self::String
327            }
328            2 => {
329                prot.skip_empty_struct()?;
330                Self::Map
331            }
332            3 => {
333                prot.skip_empty_struct()?;
334                Self::List
335            }
336            4 => {
337                prot.skip_empty_struct()?;
338                Self::Enum
339            }
340            5 => {
341                let val = DecimalType::read_thrift(&mut *prot)?;
342                Self::Decimal {
343                    scale: val.scale,
344                    precision: val.precision,
345                }
346            }
347            6 => {
348                prot.skip_empty_struct()?;
349                Self::Date
350            }
351            7 => {
352                let val = TimeType::read_thrift(&mut *prot)?;
353                Self::Time {
354                    is_adjusted_to_u_t_c: val.is_adjusted_to_u_t_c,
355                    unit: val.unit,
356                }
357            }
358            8 => {
359                let val = TimestampType::read_thrift(&mut *prot)?;
360                Self::Timestamp {
361                    is_adjusted_to_u_t_c: val.is_adjusted_to_u_t_c,
362                    unit: val.unit,
363                }
364            }
365            10 => {
366                let val = IntType::read_thrift(&mut *prot)?;
367                Self::Integer {
368                    is_signed: val.is_signed,
369                    bit_width: val.bit_width,
370                }
371            }
372            11 => {
373                prot.skip_empty_struct()?;
374                Self::Unknown
375            }
376            12 => {
377                prot.skip_empty_struct()?;
378                Self::Json
379            }
380            13 => {
381                prot.skip_empty_struct()?;
382                Self::Bson
383            }
384            14 => {
385                prot.skip_empty_struct()?;
386                Self::Uuid
387            }
388            15 => {
389                prot.skip_empty_struct()?;
390                Self::Float16
391            }
392            16 => {
393                let val = VariantType::read_thrift(&mut *prot)?;
394                Self::Variant {
395                    specification_version: val.specification_version,
396                }
397            }
398            17 => {
399                let val = GeometryType::read_thrift(&mut *prot)?;
400                Self::Geometry {
401                    crs: val.crs.map(|s| s.to_owned()),
402                }
403            }
404            18 => {
405                let val = GeographyType::read_thrift(&mut *prot)?;
406                // unset algorithm means SPHERICAL, per the spec:
407                // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#geography
408                let algorithm = val
409                    .algorithm
410                    .unwrap_or(EdgeInterpolationAlgorithm::SPHERICAL);
411                Self::Geography {
412                    crs: val.crs.map(|s| s.to_owned()),
413                    algorithm: Some(algorithm),
414                }
415            }
416            _ => {
417                prot.skip(field_ident.field_type)?;
418                Self::_Unknown {
419                    field_id: field_ident.id,
420                }
421            }
422        };
423        let field_ident = prot.read_field_begin(field_ident.id)?;
424        if field_ident.field_type != FieldType::Stop {
425            return Err(general_err!(
426                "Received multiple fields for union from remote LogicalType"
427            ));
428        }
429        Ok(ret)
430    }
431}
432
433impl WriteThrift for LogicalType {
434    const ELEMENT_TYPE: ElementType = ElementType::Struct;
435
436    fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
437        match self {
438            Self::String => {
439                writer.write_empty_struct(1, 0)?;
440            }
441            Self::Map => {
442                writer.write_empty_struct(2, 0)?;
443            }
444            Self::List => {
445                writer.write_empty_struct(3, 0)?;
446            }
447            Self::Enum => {
448                writer.write_empty_struct(4, 0)?;
449            }
450            Self::Decimal { scale, precision } => {
451                DecimalType {
452                    scale: *scale,
453                    precision: *precision,
454                }
455                .write_thrift_field(writer, 5, 0)?;
456            }
457            Self::Date => {
458                writer.write_empty_struct(6, 0)?;
459            }
460            Self::Time {
461                is_adjusted_to_u_t_c,
462                unit,
463            } => {
464                TimeType {
465                    is_adjusted_to_u_t_c: *is_adjusted_to_u_t_c,
466                    unit: *unit,
467                }
468                .write_thrift_field(writer, 7, 0)?;
469            }
470            Self::Timestamp {
471                is_adjusted_to_u_t_c,
472                unit,
473            } => {
474                TimestampType {
475                    is_adjusted_to_u_t_c: *is_adjusted_to_u_t_c,
476                    unit: *unit,
477                }
478                .write_thrift_field(writer, 8, 0)?;
479            }
480            Self::Integer {
481                bit_width,
482                is_signed,
483            } => {
484                IntType {
485                    bit_width: *bit_width,
486                    is_signed: *is_signed,
487                }
488                .write_thrift_field(writer, 10, 0)?;
489            }
490            Self::Unknown => {
491                writer.write_empty_struct(11, 0)?;
492            }
493            Self::Json => {
494                writer.write_empty_struct(12, 0)?;
495            }
496            Self::Bson => {
497                writer.write_empty_struct(13, 0)?;
498            }
499            Self::Uuid => {
500                writer.write_empty_struct(14, 0)?;
501            }
502            Self::Float16 => {
503                writer.write_empty_struct(15, 0)?;
504            }
505            Self::Variant {
506                specification_version,
507            } => {
508                VariantType {
509                    specification_version: *specification_version,
510                }
511                .write_thrift_field(writer, 16, 0)?;
512            }
513            Self::Geometry { crs } => {
514                GeometryType {
515                    crs: crs.as_ref().map(|s| s.as_str()),
516                }
517                .write_thrift_field(writer, 17, 0)?;
518            }
519            Self::Geography { crs, algorithm } => {
520                GeographyType {
521                    crs: crs.as_ref().map(|s| s.as_str()),
522                    algorithm: *algorithm,
523                }
524                .write_thrift_field(writer, 18, 0)?;
525            }
526            _ => return Err(nyi_err!("logical type")),
527        }
528        writer.write_struct_end()
529    }
530}
531
532write_thrift_field!(LogicalType, FieldType::Struct);
533
534// ----------------------------------------------------------------------
535// Mirrors thrift enum `FieldRepetitionType`
536//
537
thrift_enum!(
/// Representation of field repetition types in a schema, i.e. how many values
/// of a field each row may hold.
enum FieldRepetitionType {
  /// This field is required (can not be null) and each row has exactly 1 value.
  REQUIRED = 0;
  /// The field is optional (can be null) and each row has 0 or 1 values.
  OPTIONAL = 1;
  /// The field is repeated and can contain 0 or more values.
  REPEATED = 2;
}
);
549
550/// Type alias for thrift `FieldRepetitionType`
551pub type Repetition = FieldRepetitionType;
552
553// ----------------------------------------------------------------------
554// Mirrors thrift enum `Encoding`
555
thrift_enum!(
/// Encodings supported by Parquet.
///
/// Not all encodings are valid for all types. These enums are also used to specify the
/// encoding of definition and repetition levels.
///
/// By default this crate uses [Encoding::PLAIN], [Encoding::RLE], and [Encoding::RLE_DICTIONARY].
/// These provide very good encode and decode performance, whilst yielding reasonable storage
/// efficiency and being supported by all major parquet readers.
///
/// The delta encodings are also supported and will be used if a newer [WriterVersion] is
/// configured, however, it should be noted that these sacrifice encode and decode performance for
/// improved storage efficiency. This performance regression is particularly pronounced in the case
/// of record skipping as occurs during predicate push-down. It is recommended users assess the
/// performance impact when evaluating these encodings.
///
/// [WriterVersion]: crate::file::properties::WriterVersion
enum Encoding {
  /// Default encoding.
  /// - BOOLEAN - 1 bit per value. 0 is false; 1 is true.
  /// - INT32 - 4 bytes per value.  Stored as little-endian.
  /// - INT64 - 8 bytes per value.  Stored as little-endian.
  /// - FLOAT - 4 bytes per value.  IEEE. Stored as little-endian.
  /// - DOUBLE - 8 bytes per value.  IEEE. Stored as little-endian.
  /// - BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
  /// - FIXED_LEN_BYTE_ARRAY - Just the bytes.
  PLAIN = 0;
  // Value 1 (GROUP_VAR_INT) was never used; it is deliberately left without a
  // variant, which is why `EncodingMask` treats bit 1 as invalid.
  //  GROUP_VAR_INT = 1;
  /// **Deprecated** dictionary encoding.
  ///
  /// The values in the dictionary are encoded using PLAIN encoding.
  /// Since it is deprecated, RLE_DICTIONARY encoding is used for a data page, and
  /// PLAIN encoding is used for dictionary page.
  PLAIN_DICTIONARY = 2;
  /// Group packed run length encoding.
  ///
  /// Usable for definition/repetition levels encoding and boolean values.
  RLE = 3;
  /// **Deprecated** Bit-packed encoding.
  ///
  /// This can only be used if the data has a known max width.
  /// Usable for definition/repetition levels encoding.
  ///
  /// There are compatibility issues with files using this encoding.
  /// The parquet standard specifies the bits to be packed starting from the
  /// most-significant bit, several implementations do not follow this bit order.
  /// Several other implementations also have issues reading this encoding
  /// because of incorrect assumptions about the length of the encoded data.
  ///
  /// The RLE/bit-packing hybrid is more cpu and memory efficient and should be used instead.
  #[deprecated(
      since = "51.0.0",
      note = "Please see documentation for compatibility issues and use the RLE/bit-packing hybrid encoding instead"
  )]
  BIT_PACKED = 4;
  /// Delta encoding for integers, either INT32 or INT64.
  ///
  /// Works best on sorted data.
  DELTA_BINARY_PACKED = 5;
  /// Encoding for byte arrays to separate the length values and the data.
  ///
  /// The lengths are encoded using DELTA_BINARY_PACKED encoding.
  DELTA_LENGTH_BYTE_ARRAY = 6;
  /// Incremental encoding for byte arrays.
  ///
  /// Prefix lengths are encoded using DELTA_BINARY_PACKED encoding.
  /// Suffixes are stored using DELTA_LENGTH_BYTE_ARRAY encoding.
  DELTA_BYTE_ARRAY = 7;
  /// Dictionary encoding.
  ///
  /// The ids are encoded using the RLE encoding.
  RLE_DICTIONARY = 8;
  /// Encoding for fixed-width data.
  ///
  /// K byte-streams are created where K is the size in bytes of the data type.
  /// The individual bytes of a value are scattered to the corresponding stream and
  /// the streams are concatenated.
  /// This itself does not reduce the size of the data but can lead to better compression
  /// afterwards. Note that the use of this encoding with FIXED_LEN_BYTE_ARRAY(N) data may
  /// perform poorly for large values of N.
  BYTE_STREAM_SPLIT = 9;
}
);
639
640impl FromStr for Encoding {
641    type Err = ParquetError;
642
643    fn from_str(s: &str) -> Result<Self, Self::Err> {
644        match s {
645            "PLAIN" | "plain" => Ok(Encoding::PLAIN),
646            "PLAIN_DICTIONARY" | "plain_dictionary" => Ok(Encoding::PLAIN_DICTIONARY),
647            "RLE" | "rle" => Ok(Encoding::RLE),
648            #[allow(deprecated)]
649            "BIT_PACKED" | "bit_packed" => Ok(Encoding::BIT_PACKED),
650            "DELTA_BINARY_PACKED" | "delta_binary_packed" => Ok(Encoding::DELTA_BINARY_PACKED),
651            "DELTA_LENGTH_BYTE_ARRAY" | "delta_length_byte_array" => {
652                Ok(Encoding::DELTA_LENGTH_BYTE_ARRAY)
653            }
654            "DELTA_BYTE_ARRAY" | "delta_byte_array" => Ok(Encoding::DELTA_BYTE_ARRAY),
655            "RLE_DICTIONARY" | "rle_dictionary" => Ok(Encoding::RLE_DICTIONARY),
656            "BYTE_STREAM_SPLIT" | "byte_stream_split" => Ok(Encoding::BYTE_STREAM_SPLIT),
657            _ => Err(general_err!("unknown encoding: {}", s)),
658        }
659    }
660}
661
/// A bitmask representing the [`Encoding`]s employed while encoding a Parquet column chunk.
///
/// The Parquet [`ColumnMetaData`] struct contains an array that indicates what encodings were
/// used when writing that column chunk. For memory and performance reasons, this crate reduces
/// that array to a bitmask, where each bit position represents a different [`Encoding`]
/// (bit `n` corresponds to the encoding whose discriminant is `n`). This struct contains that
/// bitmask, and provides methods to interact with the data.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaDataReader;
/// # use parquet::basic::Encoding;
/// # fn open_parquet_file(path: &str) -> std::fs::File { unimplemented!(); }
/// // read parquet metadata from a file
/// let file = open_parquet_file("some_path.parquet");
/// let mut reader = ParquetMetaDataReader::new();
/// reader.try_parse(&file).unwrap();
/// let metadata = reader.finish().unwrap();
///
/// // find the encodings used by the first column chunk in the first row group
/// let col_meta = metadata.row_group(0).column(0);
/// let encodings = col_meta.encodings_mask();
///
/// // check to see if a particular encoding was used
/// let used_rle = encodings.is_set(Encoding::RLE);
///
/// // check to see if all of a set of encodings were used
/// let used_all = encodings.all_set([Encoding::RLE, Encoding::PLAIN].iter());
///
/// // convert mask to a Vec<Encoding>
/// let encodings_vec = encodings.encodings().collect::<Vec<_>>();
/// ```
///
/// [`ColumnMetaData`]: https://github.com/apache/parquet-format/blob/9fd57b59e0ce1a82a69237dcf8977d3e72a2965d/src/main/thrift/parquet.thrift#L875
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct EncodingMask(i32);
697
698impl EncodingMask {
699    /// Highest valued discriminant in the [`Encoding`] enum
700    const MAX_ENCODING: i32 = Encoding::MAX_DISCRIMINANT;
701    /// A mask consisting of unused bit positions, used for validation. This includes the never
702    /// used GROUP_VAR_INT encoding value of `1`.
703    const ALLOWED_MASK: u32 =
704        !(1u32 << (EncodingMask::MAX_ENCODING as u32 + 1)).wrapping_sub(1) | 1 << 1;
705
706    /// Attempt to create a new `EncodingMask` from an integer.
707    ///
708    /// This will return an error if a bit outside the allowable range is set.
709    pub fn try_new(val: i32) -> Result<Self> {
710        if val as u32 & Self::ALLOWED_MASK != 0 {
711            return Err(general_err!("Attempt to create invalid mask: 0x{:x}", val));
712        }
713        Ok(Self(val))
714    }
715
716    /// Return an integer representation of this `EncodingMask`.
717    pub fn as_i32(&self) -> i32 {
718        self.0
719    }
720
721    /// Create a new `EncodingMask` from a collection of [`Encoding`]s.
722    pub fn new_from_encodings<'a>(encodings: impl Iterator<Item = &'a Encoding>) -> Self {
723        let mut mask = 0;
724        for &e in encodings {
725            mask |= 1 << (e as i32);
726        }
727        Self(mask)
728    }
729
730    /// Mark the given [`Encoding`] as present in this mask.
731    pub fn insert(&mut self, val: Encoding) {
732        self.0 |= 1 << (val as i32);
733    }
734
735    /// Test if a given [`Encoding`] is present in this mask.
736    pub fn is_set(&self, val: Encoding) -> bool {
737        self.0 & (1 << (val as i32)) != 0
738    }
739
740    /// Test if this mask has only the bit for the given [`Encoding`] set.
741    pub fn is_only(&self, val: Encoding) -> bool {
742        self.0 == (1 << (val as i32))
743    }
744
745    /// Test if all [`Encoding`]s in a given set are present in this mask.
746    pub fn all_set<'a>(&self, mut encodings: impl Iterator<Item = &'a Encoding>) -> bool {
747        encodings.all(|&e| self.is_set(e))
748    }
749
750    /// Return an iterator over all [`Encoding`]s present in this mask.
751    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
752        Self::mask_to_encodings_iter(self.0)
753    }
754
755    fn mask_to_encodings_iter(mask: i32) -> impl Iterator<Item = Encoding> {
756        (0..=Self::MAX_ENCODING)
757            .filter(move |i| mask & (1 << i) != 0)
758            .map(i32_to_encoding)
759    }
760}
761
impl HeapSize for EncodingMask {
    /// Always zero: the mask is a single inline `i32` and owns no heap memory.
    fn heap_size(&self) -> usize {
        0 // no heap allocations
    }
}
767
768impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for EncodingMask {
769    fn read_thrift(prot: &mut R) -> Result<Self> {
770        let mut mask = 0;
771
772        // This reads a Thrift `list<Encoding>` and turns it into a bitmask
773        let list_ident = prot.read_list_begin()?;
774        for _ in 0..list_ident.size {
775            let val = Encoding::read_thrift(prot)?;
776            mask |= 1 << val as i32;
777        }
778        Ok(Self(mask))
779    }
780}
781
782#[allow(deprecated)]
783fn i32_to_encoding(val: i32) -> Encoding {
784    match val {
785        0 => Encoding::PLAIN,
786        2 => Encoding::PLAIN_DICTIONARY,
787        3 => Encoding::RLE,
788        4 => Encoding::BIT_PACKED,
789        5 => Encoding::DELTA_BINARY_PACKED,
790        6 => Encoding::DELTA_LENGTH_BYTE_ARRAY,
791        7 => Encoding::DELTA_BYTE_ARRAY,
792        8 => Encoding::RLE_DICTIONARY,
793        9 => Encoding::BYTE_STREAM_SPLIT,
794        _ => panic!("Impossible encoding {val}"),
795    }
796}
797
798// ----------------------------------------------------------------------
799// Mirrors thrift enum `CompressionCodec`
800
/// Supported block compression algorithms.
///
/// Block compression can yield non-trivial improvements to storage efficiency at the expense
/// of potentially significantly worse encode and decode performance. Many applications,
/// especially those making use of high-throughput and low-cost commodity object storage,
/// may find storage efficiency less important than decode throughput, and therefore may
/// wish to not make use of block compression.
///
/// The writers in this crate default to no block compression for this reason.
///
/// Applications that do still wish to use block compression, will find [`Compression::ZSTD`]
/// to provide a good balance of compression, performance, and ecosystem support. Alternatively,
/// [`Compression::LZ4_RAW`] provides much faster decompression speeds, at the cost of typically
/// worse compression ratios. However, it is not as widely supported by the ecosystem, with the
/// Hadoop ecosystem historically favoring the non-standard and now deprecated [`Compression::LZ4`].
///
/// Some variants carry a compression level. The level is not part of the Thrift
/// encoding: writing emits only the codec id, and reading fills in the default level.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum Compression {
    /// No compression.
    UNCOMPRESSED,
    /// [Snappy compression](https://en.wikipedia.org/wiki/Snappy_(compression))
    SNAPPY,
    /// [Gzip compression](https://www.ietf.org/rfc/rfc1952.txt), with its compression level.
    GZIP(GzipLevel),
    /// [LZO compression](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer)
    LZO,
    /// [Brotli compression](https://datatracker.ietf.org/doc/html/rfc7932), with its compression level.
    BROTLI(BrotliLevel),
    /// [LZ4 compression](https://lz4.org/), [(deprecated)](https://issues.apache.org/jira/browse/PARQUET-2032)
    LZ4,
    /// [ZSTD compression](https://datatracker.ietf.org/doc/html/rfc8878), with its compression level.
    ZSTD(ZstdLevel),
    /// [LZ4 compression](https://lz4.org/).
    LZ4_RAW,
}
836
837impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for Compression {
838    fn read_thrift(prot: &mut R) -> Result<Self> {
839        let val = prot.read_i32()?;
840        Ok(match val {
841            0 => Self::UNCOMPRESSED,
842            1 => Self::SNAPPY,
843            2 => Self::GZIP(Default::default()),
844            3 => Self::LZO,
845            4 => Self::BROTLI(Default::default()),
846            5 => Self::LZ4,
847            6 => Self::ZSTD(Default::default()),
848            7 => Self::LZ4_RAW,
849            _ => return Err(general_err!("Unexpected CompressionCodec {}", val)),
850        })
851    }
852}
853
854// TODO(ets): explore replacing this with a thrift_enum!(ThriftCompression) for the serialization
855// and then provide `From` impls to convert back and forth. This is necessary due to the addition
856// of compression level to some variants.
857impl WriteThrift for Compression {
858    const ELEMENT_TYPE: ElementType = ElementType::I32;
859
860    fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
861        let id: i32 = match *self {
862            Self::UNCOMPRESSED => 0,
863            Self::SNAPPY => 1,
864            Self::GZIP(_) => 2,
865            Self::LZO => 3,
866            Self::BROTLI(_) => 4,
867            Self::LZ4 => 5,
868            Self::ZSTD(_) => 6,
869            Self::LZ4_RAW => 7,
870        };
871        writer.write_i32(id)
872    }
873}
874
875write_thrift_field!(Compression, FieldType::I32);
876
877impl Compression {
878    /// Returns the codec type of this compression setting as a string, without the compression
879    /// level.
880    pub(crate) fn codec_to_string(self) -> String {
881        format!("{self:?}").split('(').next().unwrap().to_owned()
882    }
883}
884
885fn split_compression_string(str_setting: &str) -> Result<(&str, Option<u32>), ParquetError> {
886    let split_setting = str_setting.split_once('(');
887
888    match split_setting {
889        Some((codec, level_str)) => {
890            let level = &level_str[..level_str.len() - 1]
891                .parse::<u32>()
892                .map_err(|_| {
893                    ParquetError::General(format!("invalid compression level: {level_str}"))
894                })?;
895            Ok((codec, Some(*level)))
896        }
897        None => Ok((str_setting, None)),
898    }
899}
900
901fn check_level_is_none(level: &Option<u32>) -> Result<(), ParquetError> {
902    if level.is_some() {
903        return Err(ParquetError::General(
904            "compression level is not supported".to_string(),
905        ));
906    }
907
908    Ok(())
909}
910
911fn require_level(codec: &str, level: Option<u32>) -> Result<u32, ParquetError> {
912    level.ok_or(ParquetError::General(format!(
913        "{codec} requires a compression level",
914    )))
915}
916
917impl FromStr for Compression {
918    type Err = ParquetError;
919
920    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
921        let (codec, level) = split_compression_string(s)?;
922
923        let c = match codec {
924            "UNCOMPRESSED" | "uncompressed" => {
925                check_level_is_none(&level)?;
926                Compression::UNCOMPRESSED
927            }
928            "SNAPPY" | "snappy" => {
929                check_level_is_none(&level)?;
930                Compression::SNAPPY
931            }
932            "GZIP" | "gzip" => {
933                let level = require_level(codec, level)?;
934                Compression::GZIP(GzipLevel::try_new(level)?)
935            }
936            "LZO" | "lzo" => {
937                check_level_is_none(&level)?;
938                Compression::LZO
939            }
940            "BROTLI" | "brotli" => {
941                let level = require_level(codec, level)?;
942                Compression::BROTLI(BrotliLevel::try_new(level)?)
943            }
944            "LZ4" | "lz4" => {
945                check_level_is_none(&level)?;
946                Compression::LZ4
947            }
948            "ZSTD" | "zstd" => {
949                let level = require_level(codec, level)?;
950                Compression::ZSTD(ZstdLevel::try_new(level as i32)?)
951            }
952            "LZ4_RAW" | "lz4_raw" => {
953                check_level_is_none(&level)?;
954                Compression::LZ4_RAW
955            }
956            _ => {
957                return Err(ParquetError::General(format!(
958                    "unsupport compression {codec}"
959                )));
960            }
961        };
962
963        Ok(c)
964    }
965}
966
967// ----------------------------------------------------------------------
968// Mirrors thrift enum `PageType`
969
// The integer assigned to each variant below is the value of the corresponding entry in the
// thrift `PageType` enum that this type mirrors.
thrift_enum!(
/// Available data pages for Parquet file format.
/// Note that some of the page types may not be supported.
enum PageType {
  DATA_PAGE = 0;
  INDEX_PAGE = 1;
  DICTIONARY_PAGE = 2;
  DATA_PAGE_V2 = 3;
}
);
980
981// ----------------------------------------------------------------------
982// Mirrors thrift enum `BoundaryOrder`
983
// The integer assigned to each variant below is the value of the corresponding entry in the
// thrift `BoundaryOrder` enum that this type mirrors.
thrift_enum!(
/// Enum to annotate whether lists of min/max elements inside ColumnIndex
/// are ordered and if so, in which direction.
enum BoundaryOrder {
  UNORDERED = 0;
  ASCENDING = 1;
  DESCENDING = 2;
}
);
993
994// ----------------------------------------------------------------------
995// Mirrors thrift enum `EdgeInterpolationAlgorithm`
996
// This enum is hand-coded rather than generated with `thrift_enum!` so that it can carry the
// `_Unknown` variant, which keeps it forward compatible with values added to the spec later.
998
/// Edge interpolation algorithm for [`LogicalType::Geography`]
///
/// The explicit discriminants are the integer values used when reading and writing this enum
/// through thrift. A value that matches none of the known algorithms is preserved in
/// [`Self::_Unknown`] and written back unchanged.
///
/// The default is [`Self::SPHERICAL`].
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
#[repr(i32)]
#[derive(Default)]
pub enum EdgeInterpolationAlgorithm {
    /// Edges are interpolated as geodesics on a sphere.
    #[default]
    SPHERICAL = 0,
    /// <https://en.wikipedia.org/wiki/Vincenty%27s_formulae>
    VINCENTY = 1,
    /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970
    THOMAS = 2,
    /// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965.
    ANDOYER = 3,
    /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55
    KARNEY = 4,
    /// Unknown algorithm; holds the raw integer value that was read.
    _Unknown(i32),
}
1018
1019impl fmt::Display for EdgeInterpolationAlgorithm {
1020    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1021        f.write_fmt(format_args!("{0:?}", self))
1022    }
1023}
1024
1025impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for EdgeInterpolationAlgorithm {
1026    fn read_thrift(prot: &mut R) -> Result<Self> {
1027        let val = prot.read_i32()?;
1028        match val {
1029            0 => Ok(Self::SPHERICAL),
1030            1 => Ok(Self::VINCENTY),
1031            2 => Ok(Self::THOMAS),
1032            3 => Ok(Self::ANDOYER),
1033            4 => Ok(Self::KARNEY),
1034            _ => Ok(Self::_Unknown(val)),
1035        }
1036    }
1037}
1038
1039impl WriteThrift for EdgeInterpolationAlgorithm {
1040    const ELEMENT_TYPE: ElementType = ElementType::I32;
1041    fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
1042        let val: i32 = match *self {
1043            Self::SPHERICAL => 0,
1044            Self::VINCENTY => 1,
1045            Self::THOMAS => 2,
1046            Self::ANDOYER => 3,
1047            Self::KARNEY => 4,
1048            Self::_Unknown(i) => i,
1049        };
1050        writer.write_i32(val)
1051    }
1052}
1053
1054write_thrift_field!(EdgeInterpolationAlgorithm, FieldType::I32);
1055
1056// ----------------------------------------------------------------------
1057// Mirrors thrift union `BloomFilterAlgorithm`
1058
// Declared as a union with a single member (field id 1), mirroring the thrift definition.
thrift_union_all_empty!(
/// The algorithm used in Bloom filter.
union BloomFilterAlgorithm {
  /// Block-based Bloom filter.
  1: SplitBlockAlgorithm BLOCK;
}
);
1066
1067// ----------------------------------------------------------------------
1068// Mirrors thrift union `BloomFilterHash`
1069
// Declared as a union with a single member (field id 1), mirroring the thrift definition.
thrift_union_all_empty!(
/// The hash function used in Bloom filter. This function takes the hash of a column value
/// using plain encoding.
union BloomFilterHash {
  /// xxHash Strategy.
  1: XxHash XXHASH;
}
);
1078
1079// ----------------------------------------------------------------------
1080// Mirrors thrift union `BloomFilterCompression`
1081
// Declared as a union with a single member (field id 1), mirroring the thrift definition.
thrift_union_all_empty!(
/// The compression used in the Bloom filter.
union BloomFilterCompression {
  1: Uncompressed UNCOMPRESSED;
}
);
1088
1089// ----------------------------------------------------------------------
1090// Mirrors thrift union `ColumnOrder`
1091
/// Sort order for page and column statistics.
///
/// Types are associated with sort orders and column stats are aggregated using a sort
/// order, and a sort order should be considered when comparing values with statistics
/// min/max.
///
/// The order that applies to a given physical/logical type combination is computed by
/// [`ColumnOrder::sort_order_for_type`].
///
/// See reference in
/// <https://github.com/apache/arrow/blob/main/cpp/src/parquet/types.h>
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum SortOrder {
    /// Signed (either value or legacy byte-wise) comparison.
    SIGNED,
    /// Unsigned (depending on physical type either value or byte-wise) comparison.
    UNSIGNED,
    /// Comparison is undefined.
    UNDEFINED,
}
1110
1111impl SortOrder {
1112    /// Returns true if this is [`Self::SIGNED`]
1113    pub fn is_signed(&self) -> bool {
1114        matches!(self, Self::SIGNED)
1115    }
1116}
1117
/// Column order that specifies what method was used to aggregate min/max values for
/// statistics.
///
/// If column order is undefined, then it is the legacy behaviour and all values should
/// be compared as signed values/bytes.
///
/// Only [`ColumnOrder::TYPE_DEFINED_ORDER`] can be written to thrift; attempting to write the
/// other variants is an error.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum ColumnOrder {
    /// Column uses the order defined by its logical or physical type
    /// (if there is no logical type), parquet-format 2.4.0+.
    ///
    /// The thrift reader creates this variant with a placeholder of [`SortOrder::SIGNED`];
    /// the actual sort order has to be set after parsing.
    TYPE_DEFINED_ORDER(SortOrder),
    // The following are not defined in the Parquet spec and should always be last.
    /// Undefined column order, means legacy behaviour before parquet-format 2.4.0.
    /// Sort order is always SIGNED.
    UNDEFINED,
    /// An unknown but present ColumnOrder. Statistics with an unknown `ColumnOrder`
    /// will be ignored.
    ///
    /// The thrift reader produces this for any union member other than field id 1.
    UNKNOWN,
}
1137
1138impl ColumnOrder {
1139    /// Returns sort order for a physical/logical type.
1140    #[deprecated(
1141        since = "57.1.0",
1142        note = "use `ColumnOrder::sort_order_for_type` instead"
1143    )]
1144    pub fn get_sort_order(
1145        logical_type: Option<LogicalType>,
1146        converted_type: ConvertedType,
1147        physical_type: Type,
1148    ) -> SortOrder {
1149        Self::sort_order_for_type(logical_type.as_ref(), converted_type, physical_type)
1150    }
1151
1152    /// Returns sort order for a physical/logical type.
1153    pub fn sort_order_for_type(
1154        logical_type: Option<&LogicalType>,
1155        converted_type: ConvertedType,
1156        physical_type: Type,
1157    ) -> SortOrder {
1158        match logical_type {
1159            Some(logical) => match logical {
1160                LogicalType::String | LogicalType::Enum | LogicalType::Json | LogicalType::Bson => {
1161                    SortOrder::UNSIGNED
1162                }
1163                LogicalType::Integer { is_signed, .. } => match is_signed {
1164                    true => SortOrder::SIGNED,
1165                    false => SortOrder::UNSIGNED,
1166                },
1167                LogicalType::Map | LogicalType::List => SortOrder::UNDEFINED,
1168                LogicalType::Decimal { .. } => SortOrder::SIGNED,
1169                LogicalType::Date => SortOrder::SIGNED,
1170                LogicalType::Time { .. } => SortOrder::SIGNED,
1171                LogicalType::Timestamp { .. } => SortOrder::SIGNED,
1172                LogicalType::Unknown => SortOrder::UNDEFINED,
1173                LogicalType::Uuid => SortOrder::UNSIGNED,
1174                LogicalType::Float16 => SortOrder::SIGNED,
1175                LogicalType::Variant { .. }
1176                | LogicalType::Geometry { .. }
1177                | LogicalType::Geography { .. }
1178                | LogicalType::_Unknown { .. } => SortOrder::UNDEFINED,
1179            },
1180            // Fall back to converted type
1181            None => Self::get_converted_sort_order(converted_type, physical_type),
1182        }
1183    }
1184
1185    fn get_converted_sort_order(converted_type: ConvertedType, physical_type: Type) -> SortOrder {
1186        match converted_type {
1187            // Unsigned byte-wise comparison.
1188            ConvertedType::UTF8
1189            | ConvertedType::JSON
1190            | ConvertedType::BSON
1191            | ConvertedType::ENUM => SortOrder::UNSIGNED,
1192
1193            ConvertedType::INT_8
1194            | ConvertedType::INT_16
1195            | ConvertedType::INT_32
1196            | ConvertedType::INT_64 => SortOrder::SIGNED,
1197
1198            ConvertedType::UINT_8
1199            | ConvertedType::UINT_16
1200            | ConvertedType::UINT_32
1201            | ConvertedType::UINT_64 => SortOrder::UNSIGNED,
1202
1203            // Signed comparison of the represented value.
1204            ConvertedType::DECIMAL => SortOrder::SIGNED,
1205
1206            ConvertedType::DATE => SortOrder::SIGNED,
1207
1208            ConvertedType::TIME_MILLIS
1209            | ConvertedType::TIME_MICROS
1210            | ConvertedType::TIMESTAMP_MILLIS
1211            | ConvertedType::TIMESTAMP_MICROS => SortOrder::SIGNED,
1212
1213            ConvertedType::INTERVAL => SortOrder::UNDEFINED,
1214
1215            ConvertedType::LIST | ConvertedType::MAP | ConvertedType::MAP_KEY_VALUE => {
1216                SortOrder::UNDEFINED
1217            }
1218
1219            // Fall back to physical type.
1220            ConvertedType::NONE => Self::get_default_sort_order(physical_type),
1221        }
1222    }
1223
1224    /// Returns default sort order based on physical type.
1225    fn get_default_sort_order(physical_type: Type) -> SortOrder {
1226        match physical_type {
1227            // Order: false, true
1228            Type::BOOLEAN => SortOrder::UNSIGNED,
1229            Type::INT32 | Type::INT64 => SortOrder::SIGNED,
1230            Type::INT96 => SortOrder::UNDEFINED,
1231            // Notes to remember when comparing float/double values:
1232            // If the min is a NaN, it should be ignored.
1233            // If the max is a NaN, it should be ignored.
1234            // If the min is +0, the row group may contain -0 values as well.
1235            // If the max is -0, the row group may contain +0 values as well.
1236            // When looking for NaN values, min and max should be ignored.
1237            Type::FLOAT | Type::DOUBLE => SortOrder::SIGNED,
1238            // Unsigned byte-wise comparison
1239            Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => SortOrder::UNSIGNED,
1240        }
1241    }
1242
1243    /// Returns sort order associated with this column order.
1244    pub fn sort_order(&self) -> SortOrder {
1245        match *self {
1246            ColumnOrder::TYPE_DEFINED_ORDER(order) => order,
1247            ColumnOrder::UNDEFINED => SortOrder::SIGNED,
1248            ColumnOrder::UNKNOWN => SortOrder::UNDEFINED,
1249        }
1250    }
1251}
1252
1253impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ColumnOrder {
1254    fn read_thrift(prot: &mut R) -> Result<Self> {
1255        let field_ident = prot.read_field_begin(0)?;
1256        if field_ident.field_type == FieldType::Stop {
1257            return Err(general_err!("Received empty union from remote ColumnOrder"));
1258        }
1259        let ret = match field_ident.id {
1260            1 => {
1261                // NOTE: the sort order needs to be set correctly after parsing.
1262                prot.skip_empty_struct()?;
1263                Self::TYPE_DEFINED_ORDER(SortOrder::SIGNED)
1264            }
1265            _ => {
1266                prot.skip(field_ident.field_type)?;
1267                Self::UNKNOWN
1268            }
1269        };
1270        let field_ident = prot.read_field_begin(field_ident.id)?;
1271        if field_ident.field_type != FieldType::Stop {
1272            return Err(general_err!(
1273                "Received multiple fields for union from remote ColumnOrder"
1274            ));
1275        }
1276        Ok(ret)
1277    }
1278}
1279
1280impl WriteThrift for ColumnOrder {
1281    const ELEMENT_TYPE: ElementType = ElementType::Struct;
1282
1283    fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
1284        match *self {
1285            Self::TYPE_DEFINED_ORDER(_) => {
1286                writer.write_field_begin(FieldType::Struct, 1, 0)?;
1287                writer.write_struct_end()?;
1288            }
1289            _ => return Err(general_err!("Attempt to write undefined ColumnOrder")),
1290        }
1291        // write end of struct for this union
1292        writer.write_struct_end()
1293    }
1294}
1295
1296// ----------------------------------------------------------------------
1297// Display handlers
1298
1299impl fmt::Display for Compression {
1300    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1301        write!(f, "{self:?}")
1302    }
1303}
1304
1305impl fmt::Display for SortOrder {
1306    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1307        write!(f, "{self:?}")
1308    }
1309}
1310
1311impl fmt::Display for ColumnOrder {
1312    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1313        write!(f, "{self:?}")
1314    }
1315}
1316
1317// ----------------------------------------------------------------------
1318// LogicalType <=> ConvertedType conversion
1319
1320// Note: To prevent type loss when converting from ConvertedType to LogicalType,
1321// the conversion from ConvertedType -> LogicalType is not implemented.
1322// Such type loss includes:
1323// - Not knowing the decimal scale and precision of ConvertedType
1324// - Time and timestamp nanosecond precision, that is not supported in ConvertedType.
1325
1326impl From<Option<LogicalType>> for ConvertedType {
1327    fn from(value: Option<LogicalType>) -> Self {
1328        match value {
1329            Some(value) => match value {
1330                LogicalType::String => ConvertedType::UTF8,
1331                LogicalType::Map => ConvertedType::MAP,
1332                LogicalType::List => ConvertedType::LIST,
1333                LogicalType::Enum => ConvertedType::ENUM,
1334                LogicalType::Decimal { .. } => ConvertedType::DECIMAL,
1335                LogicalType::Date => ConvertedType::DATE,
1336                LogicalType::Time { unit, .. } => match unit {
1337                    TimeUnit::MILLIS => ConvertedType::TIME_MILLIS,
1338                    TimeUnit::MICROS => ConvertedType::TIME_MICROS,
1339                    TimeUnit::NANOS => ConvertedType::NONE,
1340                },
1341                LogicalType::Timestamp { unit, .. } => match unit {
1342                    TimeUnit::MILLIS => ConvertedType::TIMESTAMP_MILLIS,
1343                    TimeUnit::MICROS => ConvertedType::TIMESTAMP_MICROS,
1344                    TimeUnit::NANOS => ConvertedType::NONE,
1345                },
1346                LogicalType::Integer {
1347                    bit_width,
1348                    is_signed,
1349                } => match (bit_width, is_signed) {
1350                    (8, true) => ConvertedType::INT_8,
1351                    (16, true) => ConvertedType::INT_16,
1352                    (32, true) => ConvertedType::INT_32,
1353                    (64, true) => ConvertedType::INT_64,
1354                    (8, false) => ConvertedType::UINT_8,
1355                    (16, false) => ConvertedType::UINT_16,
1356                    (32, false) => ConvertedType::UINT_32,
1357                    (64, false) => ConvertedType::UINT_64,
1358                    (bit_width, is_signed) => panic!(
1359                        "Integer type bit_width={bit_width}, signed={is_signed} is not supported"
1360                    ),
1361                },
1362                LogicalType::Json => ConvertedType::JSON,
1363                LogicalType::Bson => ConvertedType::BSON,
1364                LogicalType::Uuid
1365                | LogicalType::Float16
1366                | LogicalType::Variant { .. }
1367                | LogicalType::Geometry { .. }
1368                | LogicalType::Geography { .. }
1369                | LogicalType::_Unknown { .. }
1370                | LogicalType::Unknown => ConvertedType::NONE,
1371            },
1372            None => ConvertedType::NONE,
1373        }
1374    }
1375}
1376
1377// ----------------------------------------------------------------------
1378// String conversions for schema parsing.
1379
1380impl str::FromStr for Repetition {
1381    type Err = ParquetError;
1382
1383    fn from_str(s: &str) -> Result<Self> {
1384        match s {
1385            "REQUIRED" => Ok(Repetition::REQUIRED),
1386            "OPTIONAL" => Ok(Repetition::OPTIONAL),
1387            "REPEATED" => Ok(Repetition::REPEATED),
1388            other => Err(general_err!("Invalid parquet repetition {}", other)),
1389        }
1390    }
1391}
1392
1393impl str::FromStr for Type {
1394    type Err = ParquetError;
1395
1396    fn from_str(s: &str) -> Result<Self> {
1397        match s {
1398            "BOOLEAN" => Ok(Type::BOOLEAN),
1399            "INT32" => Ok(Type::INT32),
1400            "INT64" => Ok(Type::INT64),
1401            "INT96" => Ok(Type::INT96),
1402            "FLOAT" => Ok(Type::FLOAT),
1403            "DOUBLE" => Ok(Type::DOUBLE),
1404            "BYTE_ARRAY" | "BINARY" => Ok(Type::BYTE_ARRAY),
1405            "FIXED_LEN_BYTE_ARRAY" => Ok(Type::FIXED_LEN_BYTE_ARRAY),
1406            other => Err(general_err!("Invalid parquet type {}", other)),
1407        }
1408    }
1409}
1410
1411impl str::FromStr for ConvertedType {
1412    type Err = ParquetError;
1413
1414    fn from_str(s: &str) -> Result<Self> {
1415        match s {
1416            "NONE" => Ok(ConvertedType::NONE),
1417            "UTF8" => Ok(ConvertedType::UTF8),
1418            "MAP" => Ok(ConvertedType::MAP),
1419            "MAP_KEY_VALUE" => Ok(ConvertedType::MAP_KEY_VALUE),
1420            "LIST" => Ok(ConvertedType::LIST),
1421            "ENUM" => Ok(ConvertedType::ENUM),
1422            "DECIMAL" => Ok(ConvertedType::DECIMAL),
1423            "DATE" => Ok(ConvertedType::DATE),
1424            "TIME_MILLIS" => Ok(ConvertedType::TIME_MILLIS),
1425            "TIME_MICROS" => Ok(ConvertedType::TIME_MICROS),
1426            "TIMESTAMP_MILLIS" => Ok(ConvertedType::TIMESTAMP_MILLIS),
1427            "TIMESTAMP_MICROS" => Ok(ConvertedType::TIMESTAMP_MICROS),
1428            "UINT_8" => Ok(ConvertedType::UINT_8),
1429            "UINT_16" => Ok(ConvertedType::UINT_16),
1430            "UINT_32" => Ok(ConvertedType::UINT_32),
1431            "UINT_64" => Ok(ConvertedType::UINT_64),
1432            "INT_8" => Ok(ConvertedType::INT_8),
1433            "INT_16" => Ok(ConvertedType::INT_16),
1434            "INT_32" => Ok(ConvertedType::INT_32),
1435            "INT_64" => Ok(ConvertedType::INT_64),
1436            "JSON" => Ok(ConvertedType::JSON),
1437            "BSON" => Ok(ConvertedType::BSON),
1438            "INTERVAL" => Ok(ConvertedType::INTERVAL),
1439            other => Err(general_err!("Invalid parquet converted type {}", other)),
1440        }
1441    }
1442}
1443
1444impl str::FromStr for LogicalType {
1445    type Err = ParquetError;
1446
1447    fn from_str(s: &str) -> Result<Self> {
1448        match s {
1449            // The type is a placeholder that gets updated elsewhere
1450            "INTEGER" => Ok(LogicalType::Integer {
1451                bit_width: 8,
1452                is_signed: false,
1453            }),
1454            "MAP" => Ok(LogicalType::Map),
1455            "LIST" => Ok(LogicalType::List),
1456            "ENUM" => Ok(LogicalType::Enum),
1457            "DECIMAL" => Ok(LogicalType::Decimal {
1458                precision: -1,
1459                scale: -1,
1460            }),
1461            "DATE" => Ok(LogicalType::Date),
1462            "TIME" => Ok(LogicalType::Time {
1463                is_adjusted_to_u_t_c: false,
1464                unit: TimeUnit::MILLIS,
1465            }),
1466            "TIMESTAMP" => Ok(LogicalType::Timestamp {
1467                is_adjusted_to_u_t_c: false,
1468                unit: TimeUnit::MILLIS,
1469            }),
1470            "STRING" => Ok(LogicalType::String),
1471            "JSON" => Ok(LogicalType::Json),
1472            "BSON" => Ok(LogicalType::Bson),
1473            "UUID" => Ok(LogicalType::Uuid),
1474            "UNKNOWN" => Ok(LogicalType::Unknown),
1475            "INTERVAL" => Err(general_err!(
1476                "Interval parquet logical type not yet supported"
1477            )),
1478            "FLOAT16" => Ok(LogicalType::Float16),
1479            "GEOMETRY" => Ok(LogicalType::Geometry { crs: None }),
1480            "GEOGRAPHY" => Ok(LogicalType::Geography {
1481                crs: None,
1482                algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
1483            }),
1484            other => Err(general_err!("Invalid parquet logical type {}", other)),
1485        }
1486    }
1487}
1488
1489#[cfg(test)]
1490#[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module
1491mod tests {
1492    use super::*;
1493    use crate::parquet_thrift::{ThriftSliceInputProtocol, tests::test_roundtrip};
1494
1495    #[test]
1496    fn test_display_type() {
1497        assert_eq!(Type::BOOLEAN.to_string(), "BOOLEAN");
1498        assert_eq!(Type::INT32.to_string(), "INT32");
1499        assert_eq!(Type::INT64.to_string(), "INT64");
1500        assert_eq!(Type::INT96.to_string(), "INT96");
1501        assert_eq!(Type::FLOAT.to_string(), "FLOAT");
1502        assert_eq!(Type::DOUBLE.to_string(), "DOUBLE");
1503        assert_eq!(Type::BYTE_ARRAY.to_string(), "BYTE_ARRAY");
1504        assert_eq!(
1505            Type::FIXED_LEN_BYTE_ARRAY.to_string(),
1506            "FIXED_LEN_BYTE_ARRAY"
1507        );
1508    }
1509
1510    #[test]
1511    fn test_from_string_into_type() {
1512        assert_eq!(
1513            Type::BOOLEAN.to_string().parse::<Type>().unwrap(),
1514            Type::BOOLEAN
1515        );
1516        assert_eq!(
1517            Type::INT32.to_string().parse::<Type>().unwrap(),
1518            Type::INT32
1519        );
1520        assert_eq!(
1521            Type::INT64.to_string().parse::<Type>().unwrap(),
1522            Type::INT64
1523        );
1524        assert_eq!(
1525            Type::INT96.to_string().parse::<Type>().unwrap(),
1526            Type::INT96
1527        );
1528        assert_eq!(
1529            Type::FLOAT.to_string().parse::<Type>().unwrap(),
1530            Type::FLOAT
1531        );
1532        assert_eq!(
1533            Type::DOUBLE.to_string().parse::<Type>().unwrap(),
1534            Type::DOUBLE
1535        );
1536        assert_eq!(
1537            Type::BYTE_ARRAY.to_string().parse::<Type>().unwrap(),
1538            Type::BYTE_ARRAY
1539        );
1540        assert_eq!("BINARY".parse::<Type>().unwrap(), Type::BYTE_ARRAY);
1541        assert_eq!(
1542            Type::FIXED_LEN_BYTE_ARRAY
1543                .to_string()
1544                .parse::<Type>()
1545                .unwrap(),
1546            Type::FIXED_LEN_BYTE_ARRAY
1547        );
1548    }
1549
1550    #[test]
1551    fn test_converted_type_roundtrip() {
1552        test_roundtrip(ConvertedType::UTF8);
1553        test_roundtrip(ConvertedType::MAP);
1554        test_roundtrip(ConvertedType::MAP_KEY_VALUE);
1555        test_roundtrip(ConvertedType::LIST);
1556        test_roundtrip(ConvertedType::ENUM);
1557        test_roundtrip(ConvertedType::DECIMAL);
1558        test_roundtrip(ConvertedType::DATE);
1559        test_roundtrip(ConvertedType::TIME_MILLIS);
1560        test_roundtrip(ConvertedType::TIME_MICROS);
1561        test_roundtrip(ConvertedType::TIMESTAMP_MILLIS);
1562        test_roundtrip(ConvertedType::TIMESTAMP_MICROS);
1563        test_roundtrip(ConvertedType::UINT_8);
1564        test_roundtrip(ConvertedType::UINT_16);
1565        test_roundtrip(ConvertedType::UINT_32);
1566        test_roundtrip(ConvertedType::UINT_64);
1567        test_roundtrip(ConvertedType::INT_8);
1568        test_roundtrip(ConvertedType::INT_16);
1569        test_roundtrip(ConvertedType::INT_32);
1570        test_roundtrip(ConvertedType::INT_64);
1571        test_roundtrip(ConvertedType::JSON);
1572        test_roundtrip(ConvertedType::BSON);
1573        test_roundtrip(ConvertedType::INTERVAL);
1574    }
1575
1576    #[test]
1577    fn test_read_invalid_converted_type() {
1578        let mut prot = ThriftSliceInputProtocol::new(&[0x7eu8]);
1579        let res = ConvertedType::read_thrift(&mut prot);
1580        assert!(res.is_err());
1581        assert_eq!(
1582            res.unwrap_err().to_string(),
1583            "Parquet error: Unexpected ConvertedType 63"
1584        );
1585    }
1586
1587    #[test]
1588    fn test_display_converted_type() {
1589        assert_eq!(ConvertedType::NONE.to_string(), "NONE");
1590        assert_eq!(ConvertedType::UTF8.to_string(), "UTF8");
1591        assert_eq!(ConvertedType::MAP.to_string(), "MAP");
1592        assert_eq!(ConvertedType::MAP_KEY_VALUE.to_string(), "MAP_KEY_VALUE");
1593        assert_eq!(ConvertedType::LIST.to_string(), "LIST");
1594        assert_eq!(ConvertedType::ENUM.to_string(), "ENUM");
1595        assert_eq!(ConvertedType::DECIMAL.to_string(), "DECIMAL");
1596        assert_eq!(ConvertedType::DATE.to_string(), "DATE");
1597        assert_eq!(ConvertedType::TIME_MILLIS.to_string(), "TIME_MILLIS");
1598        assert_eq!(ConvertedType::DATE.to_string(), "DATE");
1599        assert_eq!(ConvertedType::TIME_MICROS.to_string(), "TIME_MICROS");
1600        assert_eq!(
1601            ConvertedType::TIMESTAMP_MILLIS.to_string(),
1602            "TIMESTAMP_MILLIS"
1603        );
1604        assert_eq!(
1605            ConvertedType::TIMESTAMP_MICROS.to_string(),
1606            "TIMESTAMP_MICROS"
1607        );
1608        assert_eq!(ConvertedType::UINT_8.to_string(), "UINT_8");
1609        assert_eq!(ConvertedType::UINT_16.to_string(), "UINT_16");
1610        assert_eq!(ConvertedType::UINT_32.to_string(), "UINT_32");
1611        assert_eq!(ConvertedType::UINT_64.to_string(), "UINT_64");
1612        assert_eq!(ConvertedType::INT_8.to_string(), "INT_8");
1613        assert_eq!(ConvertedType::INT_16.to_string(), "INT_16");
1614        assert_eq!(ConvertedType::INT_32.to_string(), "INT_32");
1615        assert_eq!(ConvertedType::INT_64.to_string(), "INT_64");
1616        assert_eq!(ConvertedType::JSON.to_string(), "JSON");
1617        assert_eq!(ConvertedType::BSON.to_string(), "BSON");
1618        assert_eq!(ConvertedType::INTERVAL.to_string(), "INTERVAL");
1619        assert_eq!(ConvertedType::DECIMAL.to_string(), "DECIMAL")
1620    }
1621
1622    #[test]
1623    fn test_from_string_into_converted_type() {
1624        assert_eq!(
1625            ConvertedType::NONE
1626                .to_string()
1627                .parse::<ConvertedType>()
1628                .unwrap(),
1629            ConvertedType::NONE
1630        );
1631        assert_eq!(
1632            ConvertedType::UTF8
1633                .to_string()
1634                .parse::<ConvertedType>()
1635                .unwrap(),
1636            ConvertedType::UTF8
1637        );
1638        assert_eq!(
1639            ConvertedType::MAP
1640                .to_string()
1641                .parse::<ConvertedType>()
1642                .unwrap(),
1643            ConvertedType::MAP
1644        );
1645        assert_eq!(
1646            ConvertedType::MAP_KEY_VALUE
1647                .to_string()
1648                .parse::<ConvertedType>()
1649                .unwrap(),
1650            ConvertedType::MAP_KEY_VALUE
1651        );
1652        assert_eq!(
1653            ConvertedType::LIST
1654                .to_string()
1655                .parse::<ConvertedType>()
1656                .unwrap(),
1657            ConvertedType::LIST
1658        );
1659        assert_eq!(
1660            ConvertedType::ENUM
1661                .to_string()
1662                .parse::<ConvertedType>()
1663                .unwrap(),
1664            ConvertedType::ENUM
1665        );
1666        assert_eq!(
1667            ConvertedType::DECIMAL
1668                .to_string()
1669                .parse::<ConvertedType>()
1670                .unwrap(),
1671            ConvertedType::DECIMAL
1672        );
1673        assert_eq!(
1674            ConvertedType::DATE
1675                .to_string()
1676                .parse::<ConvertedType>()
1677                .unwrap(),
1678            ConvertedType::DATE
1679        );
1680        assert_eq!(
1681            ConvertedType::TIME_MILLIS
1682                .to_string()
1683                .parse::<ConvertedType>()
1684                .unwrap(),
1685            ConvertedType::TIME_MILLIS
1686        );
1687        assert_eq!(
1688            ConvertedType::TIME_MICROS
1689                .to_string()
1690                .parse::<ConvertedType>()
1691                .unwrap(),
1692            ConvertedType::TIME_MICROS
1693        );
1694        assert_eq!(
1695            ConvertedType::TIMESTAMP_MILLIS
1696                .to_string()
1697                .parse::<ConvertedType>()
1698                .unwrap(),
1699            ConvertedType::TIMESTAMP_MILLIS
1700        );
1701        assert_eq!(
1702            ConvertedType::TIMESTAMP_MICROS
1703                .to_string()
1704                .parse::<ConvertedType>()
1705                .unwrap(),
1706            ConvertedType::TIMESTAMP_MICROS
1707        );
1708        assert_eq!(
1709            ConvertedType::UINT_8
1710                .to_string()
1711                .parse::<ConvertedType>()
1712                .unwrap(),
1713            ConvertedType::UINT_8
1714        );
1715        assert_eq!(
1716            ConvertedType::UINT_16
1717                .to_string()
1718                .parse::<ConvertedType>()
1719                .unwrap(),
1720            ConvertedType::UINT_16
1721        );
1722        assert_eq!(
1723            ConvertedType::UINT_32
1724                .to_string()
1725                .parse::<ConvertedType>()
1726                .unwrap(),
1727            ConvertedType::UINT_32
1728        );
1729        assert_eq!(
1730            ConvertedType::UINT_64
1731                .to_string()
1732                .parse::<ConvertedType>()
1733                .unwrap(),
1734            ConvertedType::UINT_64
1735        );
1736        assert_eq!(
1737            ConvertedType::INT_8
1738                .to_string()
1739                .parse::<ConvertedType>()
1740                .unwrap(),
1741            ConvertedType::INT_8
1742        );
1743        assert_eq!(
1744            ConvertedType::INT_16
1745                .to_string()
1746                .parse::<ConvertedType>()
1747                .unwrap(),
1748            ConvertedType::INT_16
1749        );
1750        assert_eq!(
1751            ConvertedType::INT_32
1752                .to_string()
1753                .parse::<ConvertedType>()
1754                .unwrap(),
1755            ConvertedType::INT_32
1756        );
1757        assert_eq!(
1758            ConvertedType::INT_64
1759                .to_string()
1760                .parse::<ConvertedType>()
1761                .unwrap(),
1762            ConvertedType::INT_64
1763        );
1764        assert_eq!(
1765            ConvertedType::JSON
1766                .to_string()
1767                .parse::<ConvertedType>()
1768                .unwrap(),
1769            ConvertedType::JSON
1770        );
1771        assert_eq!(
1772            ConvertedType::BSON
1773                .to_string()
1774                .parse::<ConvertedType>()
1775                .unwrap(),
1776            ConvertedType::BSON
1777        );
1778        assert_eq!(
1779            ConvertedType::INTERVAL
1780                .to_string()
1781                .parse::<ConvertedType>()
1782                .unwrap(),
1783            ConvertedType::INTERVAL
1784        );
1785        assert_eq!(
1786            ConvertedType::DECIMAL
1787                .to_string()
1788                .parse::<ConvertedType>()
1789                .unwrap(),
1790            ConvertedType::DECIMAL
1791        )
1792    }
1793
1794    #[test]
1795    fn test_logical_to_converted_type() {
1796        let logical_none: Option<LogicalType> = None;
1797        assert_eq!(ConvertedType::from(logical_none), ConvertedType::NONE);
1798        assert_eq!(
1799            ConvertedType::from(Some(LogicalType::Decimal {
1800                precision: 20,
1801                scale: 5
1802            })),
1803            ConvertedType::DECIMAL
1804        );
1805        assert_eq!(
1806            ConvertedType::from(Some(LogicalType::Bson)),
1807            ConvertedType::BSON
1808        );
1809        assert_eq!(
1810            ConvertedType::from(Some(LogicalType::Json)),
1811            ConvertedType::JSON
1812        );
1813        assert_eq!(
1814            ConvertedType::from(Some(LogicalType::String)),
1815            ConvertedType::UTF8
1816        );
1817        assert_eq!(
1818            ConvertedType::from(Some(LogicalType::Date)),
1819            ConvertedType::DATE
1820        );
1821        assert_eq!(
1822            ConvertedType::from(Some(LogicalType::Time {
1823                unit: TimeUnit::MILLIS,
1824                is_adjusted_to_u_t_c: true,
1825            })),
1826            ConvertedType::TIME_MILLIS
1827        );
1828        assert_eq!(
1829            ConvertedType::from(Some(LogicalType::Time {
1830                unit: TimeUnit::MICROS,
1831                is_adjusted_to_u_t_c: true,
1832            })),
1833            ConvertedType::TIME_MICROS
1834        );
1835        assert_eq!(
1836            ConvertedType::from(Some(LogicalType::Time {
1837                unit: TimeUnit::NANOS,
1838                is_adjusted_to_u_t_c: false,
1839            })),
1840            ConvertedType::NONE
1841        );
1842        assert_eq!(
1843            ConvertedType::from(Some(LogicalType::Timestamp {
1844                unit: TimeUnit::MILLIS,
1845                is_adjusted_to_u_t_c: true,
1846            })),
1847            ConvertedType::TIMESTAMP_MILLIS
1848        );
1849        assert_eq!(
1850            ConvertedType::from(Some(LogicalType::Timestamp {
1851                unit: TimeUnit::MICROS,
1852                is_adjusted_to_u_t_c: false,
1853            })),
1854            ConvertedType::TIMESTAMP_MICROS
1855        );
1856        assert_eq!(
1857            ConvertedType::from(Some(LogicalType::Timestamp {
1858                unit: TimeUnit::NANOS,
1859                is_adjusted_to_u_t_c: false,
1860            })),
1861            ConvertedType::NONE
1862        );
1863        assert_eq!(
1864            ConvertedType::from(Some(LogicalType::Integer {
1865                bit_width: 8,
1866                is_signed: false
1867            })),
1868            ConvertedType::UINT_8
1869        );
1870        assert_eq!(
1871            ConvertedType::from(Some(LogicalType::Integer {
1872                bit_width: 8,
1873                is_signed: true
1874            })),
1875            ConvertedType::INT_8
1876        );
1877        assert_eq!(
1878            ConvertedType::from(Some(LogicalType::Integer {
1879                bit_width: 16,
1880                is_signed: false
1881            })),
1882            ConvertedType::UINT_16
1883        );
1884        assert_eq!(
1885            ConvertedType::from(Some(LogicalType::Integer {
1886                bit_width: 16,
1887                is_signed: true
1888            })),
1889            ConvertedType::INT_16
1890        );
1891        assert_eq!(
1892            ConvertedType::from(Some(LogicalType::Integer {
1893                bit_width: 32,
1894                is_signed: false
1895            })),
1896            ConvertedType::UINT_32
1897        );
1898        assert_eq!(
1899            ConvertedType::from(Some(LogicalType::Integer {
1900                bit_width: 32,
1901                is_signed: true
1902            })),
1903            ConvertedType::INT_32
1904        );
1905        assert_eq!(
1906            ConvertedType::from(Some(LogicalType::Integer {
1907                bit_width: 64,
1908                is_signed: false
1909            })),
1910            ConvertedType::UINT_64
1911        );
1912        assert_eq!(
1913            ConvertedType::from(Some(LogicalType::Integer {
1914                bit_width: 64,
1915                is_signed: true
1916            })),
1917            ConvertedType::INT_64
1918        );
1919        assert_eq!(
1920            ConvertedType::from(Some(LogicalType::List)),
1921            ConvertedType::LIST
1922        );
1923        assert_eq!(
1924            ConvertedType::from(Some(LogicalType::Map)),
1925            ConvertedType::MAP
1926        );
1927        assert_eq!(
1928            ConvertedType::from(Some(LogicalType::Uuid)),
1929            ConvertedType::NONE
1930        );
1931        assert_eq!(
1932            ConvertedType::from(Some(LogicalType::Enum)),
1933            ConvertedType::ENUM
1934        );
1935        assert_eq!(
1936            ConvertedType::from(Some(LogicalType::Float16)),
1937            ConvertedType::NONE
1938        );
1939        assert_eq!(
1940            ConvertedType::from(Some(LogicalType::Geometry { crs: None })),
1941            ConvertedType::NONE
1942        );
1943        assert_eq!(
1944            ConvertedType::from(Some(LogicalType::Geography {
1945                crs: None,
1946                algorithm: Some(EdgeInterpolationAlgorithm::default()),
1947            })),
1948            ConvertedType::NONE
1949        );
1950        assert_eq!(
1951            ConvertedType::from(Some(LogicalType::Unknown)),
1952            ConvertedType::NONE
1953        );
1954    }
1955
1956    #[test]
1957    fn test_logical_type_roundtrip() {
1958        test_roundtrip(LogicalType::String);
1959        test_roundtrip(LogicalType::Map);
1960        test_roundtrip(LogicalType::List);
1961        test_roundtrip(LogicalType::Enum);
1962        test_roundtrip(LogicalType::Decimal {
1963            scale: 0,
1964            precision: 20,
1965        });
1966        test_roundtrip(LogicalType::Date);
1967        test_roundtrip(LogicalType::Time {
1968            is_adjusted_to_u_t_c: true,
1969            unit: TimeUnit::MICROS,
1970        });
1971        test_roundtrip(LogicalType::Time {
1972            is_adjusted_to_u_t_c: false,
1973            unit: TimeUnit::MILLIS,
1974        });
1975        test_roundtrip(LogicalType::Time {
1976            is_adjusted_to_u_t_c: false,
1977            unit: TimeUnit::NANOS,
1978        });
1979        test_roundtrip(LogicalType::Timestamp {
1980            is_adjusted_to_u_t_c: false,
1981            unit: TimeUnit::MICROS,
1982        });
1983        test_roundtrip(LogicalType::Timestamp {
1984            is_adjusted_to_u_t_c: true,
1985            unit: TimeUnit::MILLIS,
1986        });
1987        test_roundtrip(LogicalType::Timestamp {
1988            is_adjusted_to_u_t_c: true,
1989            unit: TimeUnit::NANOS,
1990        });
1991        test_roundtrip(LogicalType::Integer {
1992            bit_width: 8,
1993            is_signed: true,
1994        });
1995        test_roundtrip(LogicalType::Integer {
1996            bit_width: 16,
1997            is_signed: false,
1998        });
1999        test_roundtrip(LogicalType::Integer {
2000            bit_width: 32,
2001            is_signed: true,
2002        });
2003        test_roundtrip(LogicalType::Integer {
2004            bit_width: 64,
2005            is_signed: false,
2006        });
2007        test_roundtrip(LogicalType::Json);
2008        test_roundtrip(LogicalType::Bson);
2009        test_roundtrip(LogicalType::Uuid);
2010        test_roundtrip(LogicalType::Float16);
2011        test_roundtrip(LogicalType::Variant {
2012            specification_version: Some(1),
2013        });
2014        test_roundtrip(LogicalType::Variant {
2015            specification_version: None,
2016        });
2017        test_roundtrip(LogicalType::Geometry {
2018            crs: Some("foo".to_owned()),
2019        });
2020        test_roundtrip(LogicalType::Geometry { crs: None });
2021        test_roundtrip(LogicalType::Geography {
2022            crs: Some("foo".to_owned()),
2023            algorithm: Some(EdgeInterpolationAlgorithm::ANDOYER),
2024        });
2025        test_roundtrip(LogicalType::Geography {
2026            crs: None,
2027            algorithm: Some(EdgeInterpolationAlgorithm::KARNEY),
2028        });
2029        test_roundtrip(LogicalType::Geography {
2030            crs: Some("foo".to_owned()),
2031            algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
2032        });
2033        test_roundtrip(LogicalType::Geography {
2034            crs: None,
2035            algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
2036        });
2037    }
2038
2039    #[test]
2040    fn test_display_repetition() {
2041        assert_eq!(Repetition::REQUIRED.to_string(), "REQUIRED");
2042        assert_eq!(Repetition::OPTIONAL.to_string(), "OPTIONAL");
2043        assert_eq!(Repetition::REPEATED.to_string(), "REPEATED");
2044    }
2045
2046    #[test]
2047    fn test_from_string_into_repetition() {
2048        assert_eq!(
2049            Repetition::REQUIRED
2050                .to_string()
2051                .parse::<Repetition>()
2052                .unwrap(),
2053            Repetition::REQUIRED
2054        );
2055        assert_eq!(
2056            Repetition::OPTIONAL
2057                .to_string()
2058                .parse::<Repetition>()
2059                .unwrap(),
2060            Repetition::OPTIONAL
2061        );
2062        assert_eq!(
2063            Repetition::REPEATED
2064                .to_string()
2065                .parse::<Repetition>()
2066                .unwrap(),
2067            Repetition::REPEATED
2068        );
2069    }
2070
2071    #[test]
2072    fn test_display_encoding() {
2073        assert_eq!(Encoding::PLAIN.to_string(), "PLAIN");
2074        assert_eq!(Encoding::PLAIN_DICTIONARY.to_string(), "PLAIN_DICTIONARY");
2075        assert_eq!(Encoding::RLE.to_string(), "RLE");
2076        assert_eq!(Encoding::BIT_PACKED.to_string(), "BIT_PACKED");
2077        assert_eq!(
2078            Encoding::DELTA_BINARY_PACKED.to_string(),
2079            "DELTA_BINARY_PACKED"
2080        );
2081        assert_eq!(
2082            Encoding::DELTA_LENGTH_BYTE_ARRAY.to_string(),
2083            "DELTA_LENGTH_BYTE_ARRAY"
2084        );
2085        assert_eq!(Encoding::DELTA_BYTE_ARRAY.to_string(), "DELTA_BYTE_ARRAY");
2086        assert_eq!(Encoding::RLE_DICTIONARY.to_string(), "RLE_DICTIONARY");
2087    }
2088
2089    #[test]
2090    fn test_compression_codec_to_string() {
2091        assert_eq!(Compression::UNCOMPRESSED.codec_to_string(), "UNCOMPRESSED");
2092        assert_eq!(
2093            Compression::ZSTD(ZstdLevel::default()).codec_to_string(),
2094            "ZSTD"
2095        );
2096    }
2097
2098    #[test]
2099    fn test_display_compression() {
2100        assert_eq!(Compression::UNCOMPRESSED.to_string(), "UNCOMPRESSED");
2101        assert_eq!(Compression::SNAPPY.to_string(), "SNAPPY");
2102        assert_eq!(
2103            Compression::GZIP(Default::default()).to_string(),
2104            "GZIP(GzipLevel(6))"
2105        );
2106        assert_eq!(Compression::LZO.to_string(), "LZO");
2107        assert_eq!(
2108            Compression::BROTLI(Default::default()).to_string(),
2109            "BROTLI(BrotliLevel(1))"
2110        );
2111        assert_eq!(Compression::LZ4.to_string(), "LZ4");
2112        assert_eq!(
2113            Compression::ZSTD(Default::default()).to_string(),
2114            "ZSTD(ZstdLevel(1))"
2115        );
2116    }
2117
2118    #[test]
2119    fn test_display_page_type() {
2120        assert_eq!(PageType::DATA_PAGE.to_string(), "DATA_PAGE");
2121        assert_eq!(PageType::INDEX_PAGE.to_string(), "INDEX_PAGE");
2122        assert_eq!(PageType::DICTIONARY_PAGE.to_string(), "DICTIONARY_PAGE");
2123        assert_eq!(PageType::DATA_PAGE_V2.to_string(), "DATA_PAGE_V2");
2124    }
2125
2126    #[test]
2127    fn test_display_sort_order() {
2128        assert_eq!(SortOrder::SIGNED.to_string(), "SIGNED");
2129        assert_eq!(SortOrder::UNSIGNED.to_string(), "UNSIGNED");
2130        assert_eq!(SortOrder::UNDEFINED.to_string(), "UNDEFINED");
2131    }
2132
2133    #[test]
2134    fn test_display_column_order() {
2135        assert_eq!(
2136            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED).to_string(),
2137            "TYPE_DEFINED_ORDER(SIGNED)"
2138        );
2139        assert_eq!(
2140            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED).to_string(),
2141            "TYPE_DEFINED_ORDER(UNSIGNED)"
2142        );
2143        assert_eq!(
2144            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED).to_string(),
2145            "TYPE_DEFINED_ORDER(UNDEFINED)"
2146        );
2147        assert_eq!(ColumnOrder::UNDEFINED.to_string(), "UNDEFINED");
2148    }
2149
2150    #[test]
2151    fn test_column_order_roundtrip() {
2152        // SortOrder::SIGNED is the default on read.
2153        test_roundtrip(ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED))
2154    }
2155
2156    #[test]
2157    fn test_column_order_get_logical_type_sort_order() {
2158        // Helper to check the order in a list of values.
2159        // Only logical type is checked.
2160        fn check_sort_order(types: Vec<LogicalType>, expected_order: SortOrder) {
2161            for tpe in types {
2162                assert_eq!(
2163                    ColumnOrder::get_sort_order(Some(tpe), ConvertedType::NONE, Type::BYTE_ARRAY),
2164                    expected_order
2165                );
2166            }
2167        }
2168
2169        // Unsigned comparison (physical type does not matter)
2170        let unsigned = vec![
2171            LogicalType::String,
2172            LogicalType::Json,
2173            LogicalType::Bson,
2174            LogicalType::Enum,
2175            LogicalType::Uuid,
2176            LogicalType::Integer {
2177                bit_width: 8,
2178                is_signed: false,
2179            },
2180            LogicalType::Integer {
2181                bit_width: 16,
2182                is_signed: false,
2183            },
2184            LogicalType::Integer {
2185                bit_width: 32,
2186                is_signed: false,
2187            },
2188            LogicalType::Integer {
2189                bit_width: 64,
2190                is_signed: false,
2191            },
2192        ];
2193        check_sort_order(unsigned, SortOrder::UNSIGNED);
2194
2195        // Signed comparison (physical type does not matter)
2196        let signed = vec![
2197            LogicalType::Integer {
2198                bit_width: 8,
2199                is_signed: true,
2200            },
2201            LogicalType::Integer {
2202                bit_width: 8,
2203                is_signed: true,
2204            },
2205            LogicalType::Integer {
2206                bit_width: 8,
2207                is_signed: true,
2208            },
2209            LogicalType::Integer {
2210                bit_width: 8,
2211                is_signed: true,
2212            },
2213            LogicalType::Decimal {
2214                scale: 20,
2215                precision: 4,
2216            },
2217            LogicalType::Date,
2218            LogicalType::Time {
2219                is_adjusted_to_u_t_c: false,
2220                unit: TimeUnit::MILLIS,
2221            },
2222            LogicalType::Time {
2223                is_adjusted_to_u_t_c: false,
2224                unit: TimeUnit::MICROS,
2225            },
2226            LogicalType::Time {
2227                is_adjusted_to_u_t_c: true,
2228                unit: TimeUnit::NANOS,
2229            },
2230            LogicalType::Timestamp {
2231                is_adjusted_to_u_t_c: false,
2232                unit: TimeUnit::MILLIS,
2233            },
2234            LogicalType::Timestamp {
2235                is_adjusted_to_u_t_c: false,
2236                unit: TimeUnit::MICROS,
2237            },
2238            LogicalType::Timestamp {
2239                is_adjusted_to_u_t_c: true,
2240                unit: TimeUnit::NANOS,
2241            },
2242            LogicalType::Float16,
2243        ];
2244        check_sort_order(signed, SortOrder::SIGNED);
2245
2246        // Undefined comparison
2247        let undefined = vec![
2248            LogicalType::List,
2249            LogicalType::Map,
2250            LogicalType::Geometry { crs: None },
2251            LogicalType::Geography {
2252                crs: None,
2253                algorithm: Some(EdgeInterpolationAlgorithm::default()),
2254            },
2255        ];
2256        check_sort_order(undefined, SortOrder::UNDEFINED);
2257    }
2258
2259    #[test]
2260    fn test_column_order_get_converted_type_sort_order() {
2261        // Helper to check the order in a list of values.
2262        // Only converted type is checked.
2263        fn check_sort_order(types: Vec<ConvertedType>, expected_order: SortOrder) {
2264            for tpe in types {
2265                assert_eq!(
2266                    ColumnOrder::get_sort_order(None, tpe, Type::BYTE_ARRAY),
2267                    expected_order
2268                );
2269            }
2270        }
2271
2272        // Unsigned comparison (physical type does not matter)
2273        let unsigned = vec![
2274            ConvertedType::UTF8,
2275            ConvertedType::JSON,
2276            ConvertedType::BSON,
2277            ConvertedType::ENUM,
2278            ConvertedType::UINT_8,
2279            ConvertedType::UINT_16,
2280            ConvertedType::UINT_32,
2281            ConvertedType::UINT_64,
2282        ];
2283        check_sort_order(unsigned, SortOrder::UNSIGNED);
2284
2285        // Signed comparison (physical type does not matter)
2286        let signed = vec![
2287            ConvertedType::INT_8,
2288            ConvertedType::INT_16,
2289            ConvertedType::INT_32,
2290            ConvertedType::INT_64,
2291            ConvertedType::DECIMAL,
2292            ConvertedType::DATE,
2293            ConvertedType::TIME_MILLIS,
2294            ConvertedType::TIME_MICROS,
2295            ConvertedType::TIMESTAMP_MILLIS,
2296            ConvertedType::TIMESTAMP_MICROS,
2297        ];
2298        check_sort_order(signed, SortOrder::SIGNED);
2299
2300        // Undefined comparison
2301        let undefined = vec![
2302            ConvertedType::LIST,
2303            ConvertedType::MAP,
2304            ConvertedType::MAP_KEY_VALUE,
2305            ConvertedType::INTERVAL,
2306        ];
2307        check_sort_order(undefined, SortOrder::UNDEFINED);
2308
2309        // Check None logical type
2310        // This should return a sort order for byte array type.
2311        check_sort_order(vec![ConvertedType::NONE], SortOrder::UNSIGNED);
2312    }
2313
2314    #[test]
2315    fn test_column_order_get_default_sort_order() {
2316        // Comparison based on physical type
2317        assert_eq!(
2318            ColumnOrder::get_default_sort_order(Type::BOOLEAN),
2319            SortOrder::UNSIGNED
2320        );
2321        assert_eq!(
2322            ColumnOrder::get_default_sort_order(Type::INT32),
2323            SortOrder::SIGNED
2324        );
2325        assert_eq!(
2326            ColumnOrder::get_default_sort_order(Type::INT64),
2327            SortOrder::SIGNED
2328        );
2329        assert_eq!(
2330            ColumnOrder::get_default_sort_order(Type::INT96),
2331            SortOrder::UNDEFINED
2332        );
2333        assert_eq!(
2334            ColumnOrder::get_default_sort_order(Type::FLOAT),
2335            SortOrder::SIGNED
2336        );
2337        assert_eq!(
2338            ColumnOrder::get_default_sort_order(Type::DOUBLE),
2339            SortOrder::SIGNED
2340        );
2341        assert_eq!(
2342            ColumnOrder::get_default_sort_order(Type::BYTE_ARRAY),
2343            SortOrder::UNSIGNED
2344        );
2345        assert_eq!(
2346            ColumnOrder::get_default_sort_order(Type::FIXED_LEN_BYTE_ARRAY),
2347            SortOrder::UNSIGNED
2348        );
2349    }
2350
2351    #[test]
2352    fn test_column_order_sort_order() {
2353        assert_eq!(
2354            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED).sort_order(),
2355            SortOrder::SIGNED
2356        );
2357        assert_eq!(
2358            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED).sort_order(),
2359            SortOrder::UNSIGNED
2360        );
2361        assert_eq!(
2362            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED).sort_order(),
2363            SortOrder::UNDEFINED
2364        );
2365        assert_eq!(ColumnOrder::UNDEFINED.sort_order(), SortOrder::SIGNED);
2366    }
2367
2368    #[test]
2369    fn test_parse_encoding() {
2370        let mut encoding: Encoding = "PLAIN".parse().unwrap();
2371        assert_eq!(encoding, Encoding::PLAIN);
2372        encoding = "PLAIN_DICTIONARY".parse().unwrap();
2373        assert_eq!(encoding, Encoding::PLAIN_DICTIONARY);
2374        encoding = "RLE".parse().unwrap();
2375        assert_eq!(encoding, Encoding::RLE);
2376        encoding = "BIT_PACKED".parse().unwrap();
2377        assert_eq!(encoding, Encoding::BIT_PACKED);
2378        encoding = "DELTA_BINARY_PACKED".parse().unwrap();
2379        assert_eq!(encoding, Encoding::DELTA_BINARY_PACKED);
2380        encoding = "DELTA_LENGTH_BYTE_ARRAY".parse().unwrap();
2381        assert_eq!(encoding, Encoding::DELTA_LENGTH_BYTE_ARRAY);
2382        encoding = "DELTA_BYTE_ARRAY".parse().unwrap();
2383        assert_eq!(encoding, Encoding::DELTA_BYTE_ARRAY);
2384        encoding = "RLE_DICTIONARY".parse().unwrap();
2385        assert_eq!(encoding, Encoding::RLE_DICTIONARY);
2386        encoding = "BYTE_STREAM_SPLIT".parse().unwrap();
2387        assert_eq!(encoding, Encoding::BYTE_STREAM_SPLIT);
2388
2389        // test lowercase
2390        encoding = "byte_stream_split".parse().unwrap();
2391        assert_eq!(encoding, Encoding::BYTE_STREAM_SPLIT);
2392
2393        // test unknown string
2394        match "plain_xxx".parse::<Encoding>() {
2395            Ok(e) => {
2396                panic!("Should not be able to parse {e:?}");
2397            }
2398            Err(e) => {
2399                assert_eq!(e.to_string(), "Parquet error: unknown encoding: plain_xxx");
2400            }
2401        }
2402    }
2403
2404    #[test]
2405    fn test_parse_compression() {
2406        let mut compress: Compression = "snappy".parse().unwrap();
2407        assert_eq!(compress, Compression::SNAPPY);
2408        compress = "lzo".parse().unwrap();
2409        assert_eq!(compress, Compression::LZO);
2410        compress = "zstd(3)".parse().unwrap();
2411        assert_eq!(compress, Compression::ZSTD(ZstdLevel::try_new(3).unwrap()));
2412        compress = "LZ4_RAW".parse().unwrap();
2413        assert_eq!(compress, Compression::LZ4_RAW);
2414        compress = "uncompressed".parse().unwrap();
2415        assert_eq!(compress, Compression::UNCOMPRESSED);
2416        compress = "snappy".parse().unwrap();
2417        assert_eq!(compress, Compression::SNAPPY);
2418        compress = "gzip(9)".parse().unwrap();
2419        assert_eq!(compress, Compression::GZIP(GzipLevel::try_new(9).unwrap()));
2420        compress = "lzo".parse().unwrap();
2421        assert_eq!(compress, Compression::LZO);
2422        compress = "brotli(3)".parse().unwrap();
2423        assert_eq!(
2424            compress,
2425            Compression::BROTLI(BrotliLevel::try_new(3).unwrap())
2426        );
2427        compress = "lz4".parse().unwrap();
2428        assert_eq!(compress, Compression::LZ4);
2429
2430        // test unknown compression
2431        let mut err = "plain_xxx".parse::<Encoding>().unwrap_err();
2432        assert_eq!(
2433            err.to_string(),
2434            "Parquet error: unknown encoding: plain_xxx"
2435        );
2436
2437        // test invalid compress level
2438        err = "gzip(-10)".parse::<Encoding>().unwrap_err();
2439        assert_eq!(
2440            err.to_string(),
2441            "Parquet error: unknown encoding: gzip(-10)"
2442        );
2443    }
2444
2445    #[test]
2446    fn test_display_boundary_order() {
2447        assert_eq!(BoundaryOrder::ASCENDING.to_string(), "ASCENDING");
2448        assert_eq!(BoundaryOrder::DESCENDING.to_string(), "DESCENDING");
2449        assert_eq!(BoundaryOrder::UNORDERED.to_string(), "UNORDERED");
2450    }
2451
2452    #[test]
2453    fn test_display_edge_algo() {
2454        assert_eq!(
2455            EdgeInterpolationAlgorithm::SPHERICAL.to_string(),
2456            "SPHERICAL"
2457        );
2458        assert_eq!(EdgeInterpolationAlgorithm::VINCENTY.to_string(), "VINCENTY");
2459        assert_eq!(EdgeInterpolationAlgorithm::THOMAS.to_string(), "THOMAS");
2460        assert_eq!(EdgeInterpolationAlgorithm::ANDOYER.to_string(), "ANDOYER");
2461        assert_eq!(EdgeInterpolationAlgorithm::KARNEY.to_string(), "KARNEY");
2462    }
2463
2464    fn encodings_roundtrip(mut encodings: Vec<Encoding>) {
2465        encodings.sort();
2466        let mask = EncodingMask::new_from_encodings(encodings.iter());
2467        assert!(mask.all_set(encodings.iter()));
2468        let v = mask.encodings().collect::<Vec<_>>();
2469        assert_eq!(v, encodings);
2470    }
2471
2472    #[test]
2473    fn test_encoding_roundtrip() {
2474        encodings_roundtrip(
2475            [
2476                Encoding::RLE,
2477                Encoding::PLAIN,
2478                Encoding::DELTA_BINARY_PACKED,
2479            ]
2480            .into(),
2481        );
2482        encodings_roundtrip([Encoding::RLE_DICTIONARY, Encoding::PLAIN_DICTIONARY].into());
2483        encodings_roundtrip([].into());
2484        let encodings = [
2485            Encoding::PLAIN,
2486            Encoding::BIT_PACKED,
2487            Encoding::RLE,
2488            Encoding::DELTA_BINARY_PACKED,
2489            Encoding::DELTA_BYTE_ARRAY,
2490            Encoding::DELTA_LENGTH_BYTE_ARRAY,
2491            Encoding::PLAIN_DICTIONARY,
2492            Encoding::RLE_DICTIONARY,
2493            Encoding::BYTE_STREAM_SPLIT,
2494        ];
2495        encodings_roundtrip(encodings.into());
2496    }
2497
2498    #[test]
2499    fn test_invalid_encoding_mask() {
2500        // any set bits higher than the max should trigger an error
2501        let res = EncodingMask::try_new(-1);
2502        assert!(res.is_err());
2503        let err = res.unwrap_err();
2504        assert_eq!(
2505            err.to_string(),
2506            "Parquet error: Attempt to create invalid mask: 0xffffffff"
2507        );
2508
2509        // test that GROUP_VAR_INT is disallowed
2510        let res = EncodingMask::try_new(2);
2511        assert!(res.is_err());
2512        let err = res.unwrap_err();
2513        assert_eq!(
2514            err.to_string(),
2515            "Parquet error: Attempt to create invalid mask: 0x2"
2516        );
2517    }
2518
2519    #[test]
2520    fn test_encoding_mask_is_only() {
2521        let mask = EncodingMask::new_from_encodings([Encoding::PLAIN].iter());
2522        assert!(mask.is_only(Encoding::PLAIN));
2523
2524        let mask =
2525            EncodingMask::new_from_encodings([Encoding::PLAIN, Encoding::PLAIN_DICTIONARY].iter());
2526        assert!(!mask.is_only(Encoding::PLAIN));
2527    }
2528}