parquet/
basic.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains Rust mappings for Thrift definition. This module contains only mappings for thrift
19//! enums and unions. Thrift structs are handled elsewhere.
20//! Refer to [`parquet.thrift`](https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift)
21//! file to see raw definitions.
22
23use std::io::Write;
24use std::str::FromStr;
25use std::{fmt, str};
26
27pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel};
28use crate::file::metadata::HeapSize;
29use crate::parquet_thrift::{
30    ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
31    WriteThrift, WriteThriftField,
32};
33use crate::{thrift_enum, thrift_struct, thrift_union_all_empty, write_thrift_field};
34
35use crate::errors::{ParquetError, Result};
36
37// ----------------------------------------------------------------------
38// Types from the Thrift definition
39
40// ----------------------------------------------------------------------
41// Mirrors thrift enum `Type`
42
// The discriminants below are the numeric values of the Thrift `Type` enum that this
// definition mirrors.
thrift_enum!(
/// Types supported by Parquet.
///
/// These physical types are intended to be used in combination with the encodings to
/// control the on disk storage format.
/// For example INT16 is not included as a type since a good encoding of INT32
/// would handle this.
enum Type {
  BOOLEAN = 0;
  INT32 = 1;
  INT64 = 2;
  INT96 = 3;  // deprecated, only used by legacy implementations.
  FLOAT = 4;
  DOUBLE = 5;
  BYTE_ARRAY = 6;
  FIXED_LEN_BYTE_ARRAY = 7;
}
);
61
62// ----------------------------------------------------------------------
63// Mirrors thrift enum `ConvertedType`
64
65// TODO(ets): Adding the `NONE` variant to this enum is a bit awkward. We should
66// look into removing it and using `Option<ConvertedType>` instead.
thrift_enum!(
/// Common types (converted types) used by frameworks when using Parquet.
///
/// This helps map between types in those frameworks to the base types in Parquet.
/// This is only metadata and not needed to read or write the data.
///
/// This enum was renamed from `LogicalType` in version 4.0.0.
/// If targeting Parquet format 2.4.0 or above, please use [`LogicalType`] instead.
enum ConvertedType {
  /// Not defined in the spec, used internally to indicate no type conversion.
  NONE = -1;

  /// A BYTE_ARRAY actually contains UTF8 encoded chars.
  UTF8 = 0;

  /// A map is converted as an optional field containing a repeated key/value pair.
  MAP = 1;

  /// A key/value pair is converted into a group of two fields.
  MAP_KEY_VALUE = 2;

  /// A list is converted into an optional field containing a repeated field for its
  /// values.
  LIST = 3;

  /// An enum is converted into a BYTE_ARRAY field.
  ENUM = 4;

  /// A decimal value.
  ///
  /// This may be used to annotate BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY primitive
  /// types. The underlying byte array stores the unscaled value encoded as two's
  /// complement using big-endian byte order (the most significant byte is the
  /// zeroth element). The value of the decimal is the value * 10^{-scale}.
  ///
  /// This must be accompanied by a (maximum) precision and a scale in the
  /// SchemaElement. The precision specifies the number of digits in the decimal
  /// and the scale stores the location of the decimal point. For example 1.23
  /// would have precision 3 (3 total digits) and scale 2 (the decimal point is
  /// 2 digits over).
  DECIMAL = 5;

  /// A date stored as days since Unix epoch, encoded as the INT32 physical type.
  DATE = 6;

  /// The total number of milliseconds since midnight. The value is stored as an INT32
  /// physical type.
  TIME_MILLIS = 7;

  /// The total number of microseconds since midnight. The value is stored as an INT64
  /// physical type.
  TIME_MICROS = 8;

  /// Date and time recorded as milliseconds since the Unix epoch.
  /// The value is stored as an INT64 physical type.
  TIMESTAMP_MILLIS = 9;

  /// Date and time recorded as microseconds since the Unix epoch.
  /// The value is stored as an INT64 physical type.
  TIMESTAMP_MICROS = 10;

  /// An unsigned 8 bit integer value stored as INT32 physical type.
  UINT_8 = 11;

  /// An unsigned 16 bit integer value stored as INT32 physical type.
  UINT_16 = 12;

  /// An unsigned 32 bit integer value stored as INT32 physical type.
  UINT_32 = 13;

  /// An unsigned 64 bit integer value stored as INT64 physical type.
  UINT_64 = 14;

  /// A signed 8 bit integer value stored as INT32 physical type.
  INT_8 = 15;

  /// A signed 16 bit integer value stored as INT32 physical type.
  INT_16 = 16;

  /// A signed 32 bit integer value stored as INT32 physical type.
  INT_32 = 17;

  /// A signed 64 bit integer value stored as INT64 physical type.
  INT_64 = 18;

  /// A JSON document embedded within a single UTF8 column.
  JSON = 19;

   /// A BSON document embedded within a single BINARY column.
  BSON = 20;

  /// An interval of time.
  ///
  /// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12.
  /// This data is composed of three separate little endian unsigned integers.
  /// Each stores a component of a duration of time. The first integer identifies
  /// the number of months associated with the duration, the second identifies
  /// the number of days associated with the duration and the third identifies
  /// the number of milliseconds associated with the provided duration.
  /// This duration of time is independent of any particular timezone or date.
  INTERVAL = 21;
}
);
170
171// ----------------------------------------------------------------------
172// Mirrors thrift union `TimeUnit`
173
thrift_union_all_empty!(
/// Time unit for `Time` and `Timestamp` logical types.
// NOTE(review): each member line appears to read
// `<thrift field id>: <thrift struct name> <Rust variant name>` — confirm against the
// `thrift_union_all_empty!` macro definition.
union TimeUnit {
  1: MilliSeconds MILLIS
  2: MicroSeconds MICROS
  3: NanoSeconds NANOS
}
);
182
183// ----------------------------------------------------------------------
184// Mirrors thrift union `LogicalType`
185
186// private structs for decoding logical type
187
// Thrift payload of `LogicalType::Decimal` (union field 5): read into / written from the
// `scale` and `precision` fields of that variant.
thrift_struct!(
struct DecimalType {
  1: required i32 scale
  2: required i32 precision
}
);
194
// Thrift payload of `LogicalType::Timestamp` (union field 8). Through the `TimeType`
// alias it is also used for `LogicalType::Time` (union field 7).
thrift_struct!(
struct TimestampType {
  1: required bool is_adjusted_to_u_t_c
  2: required TimeUnit unit
}
);
201
// `TimeType` has exactly the same fields as `TimestampType` (a UTC-adjustment flag and a
// `TimeUnit`), so a single struct definition is shared under both names.
use TimestampType as TimeType;
204
// Thrift payload of `LogicalType::Integer` (union field 10).
thrift_struct!(
struct IntType {
  1: required i8 bit_width
  2: required bool is_signed
}
);
211
// Thrift payload of `LogicalType::Variant` (union field 16).
thrift_struct!(
struct VariantType {
  // The version of the variant specification that the variant was
  // written with. Optional: surfaced as `Option<i8>` on `LogicalType::Variant`.
  1: optional i8 specification_version
}
);
219
// Thrift payload of `LogicalType::Geometry` (union field 17). The `crs` string is held as
// a borrowed `&'a str` here and copied into an owned `String` when the `LogicalType`
// is built.
thrift_struct!(
struct GeometryType<'a> {
  1: optional string<'a> crs;
}
);
225
// Thrift payload of `LogicalType::Geography` (union field 18). The `crs` string is held as
// a borrowed `&'a str` here and copied into an owned `String` when the `LogicalType`
// is built.
thrift_struct!(
struct GeographyType<'a> {
  1: optional string<'a> crs;
  2: optional EdgeInterpolationAlgorithm algorithm;
}
);
232
233// TODO(ets): should we switch to tuple variants so we can use
234// the thrift macros?
235
/// Logical types used by version 2.4.0+ of the Parquet format.
///
/// This is an *entirely new* enum as of version
/// 4.0.0. The enum previously named `LogicalType` was renamed to
/// [`ConvertedType`]. Please see the README.md for more details.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LogicalType {
    /// A UTF8 encoded string.
    String,
    /// A map of key-value pairs.
    Map,
    /// A list of elements.
    List,
    /// A set of predefined values.
    Enum,
    /// A decimal value with a specified scale and precision.
    Decimal {
        /// The location of the decimal point (e.g. `1.23` has a scale of 2).
        scale: i32,
        /// The number of digits in the decimal (e.g. `1.23` has a precision of 3).
        precision: i32,
    },
    /// A date stored as days since Unix epoch.
    Date,
    /// A time stored as [`TimeUnit`] since midnight.
    Time {
        /// Whether the time is adjusted to UTC.
        is_adjusted_to_u_t_c: bool,
        /// The unit of time.
        unit: TimeUnit,
    },
    /// A timestamp stored as [`TimeUnit`] since Unix epoch.
    Timestamp {
        /// Whether the timestamp is adjusted to UTC.
        is_adjusted_to_u_t_c: bool,
        /// The unit of time.
        unit: TimeUnit,
    },
    /// An integer with a specified bit width and signedness.
    Integer {
        /// The number of bits in the integer.
        bit_width: i8,
        /// Whether the integer is signed.
        is_signed: bool,
    },
    /// An unknown logical type.
    Unknown,
    /// A JSON document.
    Json,
    /// A BSON document.
    Bson,
    /// A UUID.
    Uuid,
    /// A 16-bit floating point number.
    Float16,
    /// A Variant value.
    Variant {
        /// The version of the variant specification that the variant was written with.
        specification_version: Option<i8>,
    },
    /// A geospatial feature in the Well-Known Binary (WKB) format with linear/planar edges interpolation.
    Geometry {
        /// A custom CRS. If unset, the CRS defaults to `OGC:CRS84`, which means that the
        /// geometries must be stored in longitude, latitude based on the WGS84 datum.
        crs: Option<String>,
    },
    /// A geospatial feature in the WKB format with an explicit (non-linear/non-planar) edges interpolation.
    Geography {
        /// A custom CRS. If unset, the CRS defaults to `OGC:CRS84`.
        crs: Option<String>,
        /// An optional algorithm can be set to correctly interpret edges interpolation
        /// of the geometries. If unset, the algorithm defaults to `SPHERICAL`.
        algorithm: Option<EdgeInterpolationAlgorithm>,
    },
    /// For forward compatibility; used when an unknown union value is encountered.
    _Unknown {
        /// The field id encountered when parsing the unknown logical type.
        field_id: i16,
    },
}
316
317impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType {
318    fn read_thrift(prot: &mut R) -> Result<Self> {
319        let field_ident = prot.read_field_begin(0)?;
320        if field_ident.field_type == FieldType::Stop {
321            return Err(general_err!("received empty union from remote LogicalType"));
322        }
323        let ret = match field_ident.id {
324            1 => {
325                prot.skip_empty_struct()?;
326                Self::String
327            }
328            2 => {
329                prot.skip_empty_struct()?;
330                Self::Map
331            }
332            3 => {
333                prot.skip_empty_struct()?;
334                Self::List
335            }
336            4 => {
337                prot.skip_empty_struct()?;
338                Self::Enum
339            }
340            5 => {
341                let val = DecimalType::read_thrift(&mut *prot)?;
342                Self::Decimal {
343                    scale: val.scale,
344                    precision: val.precision,
345                }
346            }
347            6 => {
348                prot.skip_empty_struct()?;
349                Self::Date
350            }
351            7 => {
352                let val = TimeType::read_thrift(&mut *prot)?;
353                Self::Time {
354                    is_adjusted_to_u_t_c: val.is_adjusted_to_u_t_c,
355                    unit: val.unit,
356                }
357            }
358            8 => {
359                let val = TimestampType::read_thrift(&mut *prot)?;
360                Self::Timestamp {
361                    is_adjusted_to_u_t_c: val.is_adjusted_to_u_t_c,
362                    unit: val.unit,
363                }
364            }
365            10 => {
366                let val = IntType::read_thrift(&mut *prot)?;
367                Self::Integer {
368                    is_signed: val.is_signed,
369                    bit_width: val.bit_width,
370                }
371            }
372            11 => {
373                prot.skip_empty_struct()?;
374                Self::Unknown
375            }
376            12 => {
377                prot.skip_empty_struct()?;
378                Self::Json
379            }
380            13 => {
381                prot.skip_empty_struct()?;
382                Self::Bson
383            }
384            14 => {
385                prot.skip_empty_struct()?;
386                Self::Uuid
387            }
388            15 => {
389                prot.skip_empty_struct()?;
390                Self::Float16
391            }
392            16 => {
393                let val = VariantType::read_thrift(&mut *prot)?;
394                Self::Variant {
395                    specification_version: val.specification_version,
396                }
397            }
398            17 => {
399                let val = GeometryType::read_thrift(&mut *prot)?;
400                Self::Geometry {
401                    crs: val.crs.map(|s| s.to_owned()),
402                }
403            }
404            18 => {
405                let val = GeographyType::read_thrift(&mut *prot)?;
406                // unset algorithm means SPHERICAL, per the spec:
407                // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#geography
408                let algorithm = val
409                    .algorithm
410                    .unwrap_or(EdgeInterpolationAlgorithm::SPHERICAL);
411                Self::Geography {
412                    crs: val.crs.map(|s| s.to_owned()),
413                    algorithm: Some(algorithm),
414                }
415            }
416            _ => {
417                prot.skip(field_ident.field_type)?;
418                Self::_Unknown {
419                    field_id: field_ident.id,
420                }
421            }
422        };
423        let field_ident = prot.read_field_begin(field_ident.id)?;
424        if field_ident.field_type != FieldType::Stop {
425            return Err(general_err!(
426                "Received multiple fields for union from remote LogicalType"
427            ));
428        }
429        Ok(ret)
430    }
431}
432
433impl WriteThrift for LogicalType {
434    const ELEMENT_TYPE: ElementType = ElementType::Struct;
435
436    fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
437        match self {
438            Self::String => {
439                writer.write_empty_struct(1, 0)?;
440            }
441            Self::Map => {
442                writer.write_empty_struct(2, 0)?;
443            }
444            Self::List => {
445                writer.write_empty_struct(3, 0)?;
446            }
447            Self::Enum => {
448                writer.write_empty_struct(4, 0)?;
449            }
450            Self::Decimal { scale, precision } => {
451                DecimalType {
452                    scale: *scale,
453                    precision: *precision,
454                }
455                .write_thrift_field(writer, 5, 0)?;
456            }
457            Self::Date => {
458                writer.write_empty_struct(6, 0)?;
459            }
460            Self::Time {
461                is_adjusted_to_u_t_c,
462                unit,
463            } => {
464                TimeType {
465                    is_adjusted_to_u_t_c: *is_adjusted_to_u_t_c,
466                    unit: *unit,
467                }
468                .write_thrift_field(writer, 7, 0)?;
469            }
470            Self::Timestamp {
471                is_adjusted_to_u_t_c,
472                unit,
473            } => {
474                TimestampType {
475                    is_adjusted_to_u_t_c: *is_adjusted_to_u_t_c,
476                    unit: *unit,
477                }
478                .write_thrift_field(writer, 8, 0)?;
479            }
480            Self::Integer {
481                bit_width,
482                is_signed,
483            } => {
484                IntType {
485                    bit_width: *bit_width,
486                    is_signed: *is_signed,
487                }
488                .write_thrift_field(writer, 10, 0)?;
489            }
490            Self::Unknown => {
491                writer.write_empty_struct(11, 0)?;
492            }
493            Self::Json => {
494                writer.write_empty_struct(12, 0)?;
495            }
496            Self::Bson => {
497                writer.write_empty_struct(13, 0)?;
498            }
499            Self::Uuid => {
500                writer.write_empty_struct(14, 0)?;
501            }
502            Self::Float16 => {
503                writer.write_empty_struct(15, 0)?;
504            }
505            Self::Variant {
506                specification_version,
507            } => {
508                VariantType {
509                    specification_version: *specification_version,
510                }
511                .write_thrift_field(writer, 16, 0)?;
512            }
513            Self::Geometry { crs } => {
514                GeometryType {
515                    crs: crs.as_ref().map(|s| s.as_str()),
516                }
517                .write_thrift_field(writer, 17, 0)?;
518            }
519            Self::Geography { crs, algorithm } => {
520                GeographyType {
521                    crs: crs.as_ref().map(|s| s.as_str()),
522                    algorithm: *algorithm,
523                }
524                .write_thrift_field(writer, 18, 0)?;
525            }
526            _ => return Err(nyi_err!("logical type")),
527        }
528        writer.write_struct_end()
529    }
530}
531
// NOTE(review): presumably generates the `WriteThriftField` impl that lets a `LogicalType`
// be written as a struct-typed field of an enclosing thrift struct — confirm against the
// `write_thrift_field!` macro definition.
write_thrift_field!(LogicalType, FieldType::Struct);
533
534// ----------------------------------------------------------------------
535// Mirrors thrift enum `FieldRepetitionType`
536//
537
thrift_enum!(
/// Repetition of a field in the schema: whether it is required, optional or repeated.
enum FieldRepetitionType {
  /// This field is required (can not be null) and each row has exactly 1 value.
  REQUIRED = 0;
  /// The field is optional (can be null) and each row has 0 or 1 values.
  OPTIONAL = 1;
  /// The field is repeated and can contain 0 or more values.
  REPEATED = 2;
}
);
549
/// Shorter alias for the thrift enum [`FieldRepetitionType`].
pub type Repetition = FieldRepetitionType;
552
553// ----------------------------------------------------------------------
554// Mirrors thrift enum `Encoding`
555
thrift_enum!(
/// Encodings supported by Parquet.
///
/// Not all encodings are valid for all types. These enums are also used to specify the
/// encoding of definition and repetition levels.
///
/// By default this crate uses [Encoding::PLAIN], [Encoding::RLE], and [Encoding::RLE_DICTIONARY].
/// These provide very good encode and decode performance, whilst yielding reasonable storage
/// efficiency and being supported by all major parquet readers.
///
/// The delta encodings are also supported and will be used if a newer [WriterVersion] is
/// configured, however, it should be noted that these sacrifice encode and decode performance for
/// improved storage efficiency. This performance regression is particularly pronounced in the case
/// of record skipping as occurs during predicate push-down. It is recommended users assess the
/// performance impact when evaluating these encodings.
///
/// [WriterVersion]: crate::file::properties::WriterVersion
enum Encoding {
  /// Default encoding.
  /// - BOOLEAN - 1 bit per value. 0 is false; 1 is true.
  /// - INT32 - 4 bytes per value.  Stored as little-endian.
  /// - INT64 - 8 bytes per value.  Stored as little-endian.
  /// - FLOAT - 4 bytes per value.  IEEE. Stored as little-endian.
  /// - DOUBLE - 8 bytes per value.  IEEE. Stored as little-endian.
  /// - BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
  /// - FIXED_LEN_BYTE_ARRAY - Just the bytes.
  PLAIN = 0;
  // The value 1 (GROUP_VAR_INT) was never used, so there is intentionally no variant for it:
  //  GROUP_VAR_INT = 1;
  /// **Deprecated** dictionary encoding.
  ///
  /// The values in the dictionary are encoded using PLAIN encoding.
  /// Since it is deprecated, RLE_DICTIONARY encoding is used for a data page, and
  /// PLAIN encoding is used for dictionary page.
  PLAIN_DICTIONARY = 2;
  /// Group packed run length encoding.
  ///
  /// Usable for definition/repetition levels encoding and boolean values.
  RLE = 3;
  /// **Deprecated** Bit-packed encoding.
  ///
  /// This can only be used if the data has a known max width.
  /// Usable for definition/repetition levels encoding.
  ///
  /// There are compatibility issues with files using this encoding.
  /// The parquet standard specifies the bits to be packed starting from the
  /// most-significant bit, several implementations do not follow this bit order.
  /// Several other implementations also have issues reading this encoding
  /// because of incorrect assumptions about the length of the encoded data.
  ///
  /// The RLE/bit-packing hybrid is more cpu and memory efficient and should be used instead.
  #[deprecated(
      since = "51.0.0",
      note = "Please see documentation for compatibility issues and use the RLE/bit-packing hybrid encoding instead"
  )]
  BIT_PACKED = 4;
  /// Delta encoding for integers, either INT32 or INT64.
  ///
  /// Works best on sorted data.
  DELTA_BINARY_PACKED = 5;
  /// Encoding for byte arrays to separate the length values and the data.
  ///
  /// The lengths are encoded using DELTA_BINARY_PACKED encoding.
  DELTA_LENGTH_BYTE_ARRAY = 6;
  /// Incremental encoding for byte arrays.
  ///
  /// Prefix lengths are encoded using DELTA_BINARY_PACKED encoding.
  /// Suffixes are stored using DELTA_LENGTH_BYTE_ARRAY encoding.
  DELTA_BYTE_ARRAY = 7;
  /// Dictionary encoding.
  ///
  /// The ids are encoded using the RLE encoding.
  RLE_DICTIONARY = 8;
  /// Encoding for fixed-width data.
  ///
  /// K byte-streams are created where K is the size in bytes of the data type.
  /// The individual bytes of a value are scattered to the corresponding stream and
  /// the streams are concatenated.
  /// This itself does not reduce the size of the data but can lead to better compression
  /// afterwards. Note that the use of this encoding with FIXED_LEN_BYTE_ARRAY(N) data may
  /// perform poorly for large values of N.
  BYTE_STREAM_SPLIT = 9;
}
);
639
640impl FromStr for Encoding {
641    type Err = ParquetError;
642
643    fn from_str(s: &str) -> Result<Self, Self::Err> {
644        match s {
645            "PLAIN" | "plain" => Ok(Encoding::PLAIN),
646            "PLAIN_DICTIONARY" | "plain_dictionary" => Ok(Encoding::PLAIN_DICTIONARY),
647            "RLE" | "rle" => Ok(Encoding::RLE),
648            #[allow(deprecated)]
649            "BIT_PACKED" | "bit_packed" => Ok(Encoding::BIT_PACKED),
650            "DELTA_BINARY_PACKED" | "delta_binary_packed" => Ok(Encoding::DELTA_BINARY_PACKED),
651            "DELTA_LENGTH_BYTE_ARRAY" | "delta_length_byte_array" => {
652                Ok(Encoding::DELTA_LENGTH_BYTE_ARRAY)
653            }
654            "DELTA_BYTE_ARRAY" | "delta_byte_array" => Ok(Encoding::DELTA_BYTE_ARRAY),
655            "RLE_DICTIONARY" | "rle_dictionary" => Ok(Encoding::RLE_DICTIONARY),
656            "BYTE_STREAM_SPLIT" | "byte_stream_split" => Ok(Encoding::BYTE_STREAM_SPLIT),
657            _ => Err(general_err!("unknown encoding: {}", s)),
658        }
659    }
660}
661
/// A bitmask representing the [`Encoding`]s employed while encoding a Parquet column chunk.
///
/// The Parquet [`ColumnMetaData`] struct contains an array that indicates what encodings were
/// used when writing that column chunk. For memory and performance reasons, this crate reduces
/// that array to a bitmask, where each bit position represents a different [`Encoding`]: bit
/// `n` is set when the encoding whose discriminant is `n` was used. This struct contains that
/// bitmask, and provides methods to interact with the data.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaDataReader;
/// # use parquet::basic::Encoding;
/// # fn open_parquet_file(path: &str) -> std::fs::File { unimplemented!(); }
/// // read parquet metadata from a file
/// let file = open_parquet_file("some_path.parquet");
/// let mut reader = ParquetMetaDataReader::new();
/// reader.try_parse(&file).unwrap();
/// let metadata = reader.finish().unwrap();
///
/// // find the encodings used by the first column chunk in the first row group
/// let col_meta = metadata.row_group(0).column(0);
/// let encodings = col_meta.encodings_mask();
///
/// // check to see if a particular encoding was used
/// let used_rle = encodings.is_set(Encoding::RLE);
///
/// // check to see if all of a set of encodings were used
/// let used_all = encodings.all_set([Encoding::RLE, Encoding::PLAIN].iter());
///
/// // convert mask to a Vec<Encoding>
/// let encodings_vec = encodings.encodings().collect::<Vec<_>>();
/// ```
///
/// [`ColumnMetaData`]: https://github.com/apache/parquet-format/blob/9fd57b59e0ce1a82a69237dcf8977d3e72a2965d/src/main/thrift/parquet.thrift#L875
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
// The derived `Default` is the empty mask (no encodings set).
pub struct EncodingMask(i32);
697
698impl EncodingMask {
699    /// Highest valued discriminant in the [`Encoding`] enum
700    const MAX_ENCODING: i32 = Encoding::MAX_DISCRIMINANT;
701    /// A mask consisting of unused bit positions, used for validation. This includes the never
702    /// used GROUP_VAR_INT encoding value of `1`.
703    const ALLOWED_MASK: u32 =
704        !(1u32 << (EncodingMask::MAX_ENCODING as u32 + 1)).wrapping_sub(1) | 1 << 1;
705
706    /// Attempt to create a new `EncodingMask` from an integer.
707    ///
708    /// This will return an error if a bit outside the allowable range is set.
709    pub fn try_new(val: i32) -> Result<Self> {
710        if val as u32 & Self::ALLOWED_MASK != 0 {
711            return Err(general_err!("Attempt to create invalid mask: 0x{:x}", val));
712        }
713        Ok(Self(val))
714    }
715
716    /// Return an integer representation of this `EncodingMask`.
717    pub fn as_i32(&self) -> i32 {
718        self.0
719    }
720
721    /// Create a new `EncodingMask` from a collection of [`Encoding`]s.
722    pub fn new_from_encodings<'a>(encodings: impl Iterator<Item = &'a Encoding>) -> Self {
723        let mut mask = 0;
724        for &e in encodings {
725            mask |= 1 << (e as i32);
726        }
727        Self(mask)
728    }
729
730    /// Mark the given [`Encoding`] as present in this mask.
731    pub fn insert(&mut self, val: Encoding) {
732        self.0 |= 1 << (val as i32);
733    }
734
735    /// Test if a given [`Encoding`] is present in this mask.
736    pub fn is_set(&self, val: Encoding) -> bool {
737        self.0 & (1 << (val as i32)) != 0
738    }
739
740    /// Test if all [`Encoding`]s in a given set are present in this mask.
741    pub fn all_set<'a>(&self, mut encodings: impl Iterator<Item = &'a Encoding>) -> bool {
742        encodings.all(|&e| self.is_set(e))
743    }
744
745    /// Return an iterator over all [`Encoding`]s present in this mask.
746    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
747        Self::mask_to_encodings_iter(self.0)
748    }
749
750    fn mask_to_encodings_iter(mask: i32) -> impl Iterator<Item = Encoding> {
751        (0..=Self::MAX_ENCODING)
752            .filter(move |i| mask & (1 << i) != 0)
753            .map(i32_to_encoding)
754    }
755}
756
impl HeapSize for EncodingMask {
    /// Always `0`: the mask is a single inline `i32` and owns no heap memory.
    fn heap_size(&self) -> usize {
        0 // no heap allocations
    }
}
762
763impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for EncodingMask {
764    fn read_thrift(prot: &mut R) -> Result<Self> {
765        let mut mask = 0;
766
767        // This reads a Thrift `list<Encoding>` and turns it into a bitmask
768        let list_ident = prot.read_list_begin()?;
769        for _ in 0..list_ident.size {
770            let val = Encoding::read_thrift(prot)?;
771            mask |= 1 << val as i32;
772        }
773        Ok(Self(mask))
774    }
775}
776
777#[allow(deprecated)]
778fn i32_to_encoding(val: i32) -> Encoding {
779    match val {
780        0 => Encoding::PLAIN,
781        2 => Encoding::PLAIN_DICTIONARY,
782        3 => Encoding::RLE,
783        4 => Encoding::BIT_PACKED,
784        5 => Encoding::DELTA_BINARY_PACKED,
785        6 => Encoding::DELTA_LENGTH_BYTE_ARRAY,
786        7 => Encoding::DELTA_BYTE_ARRAY,
787        8 => Encoding::RLE_DICTIONARY,
788        9 => Encoding::BYTE_STREAM_SPLIT,
789        _ => panic!("Impossible encoding {val}"),
790    }
791}
792
793// ----------------------------------------------------------------------
794// Mirrors thrift enum `CompressionCodec`
795
/// Supported block compression algorithms.
///
/// Block compression can yield non-trivial improvements to storage efficiency at the expense
/// of potentially significantly worse encode and decode performance. Many applications,
/// especially those making use of high-throughput and low-cost commodity object storage,
/// may find storage efficiency less important than decode throughput, and therefore may
/// wish to not make use of block compression.
///
/// The writers in this crate default to no block compression for this reason.
///
/// Applications that do still wish to use block compression, will find [`Compression::ZSTD`]
/// to provide a good balance of compression, performance, and ecosystem support. Alternatively,
/// [`Compression::LZ4_RAW`] provides much faster decompression speeds, at the cost of typically
/// worse compression ratios. However, it is not as widely supported by the ecosystem, with the
/// Hadoop ecosystem historically favoring the non-standard and now deprecated [`Compression::LZ4`].
///
/// The `GZIP`, `BROTLI` and `ZSTD` variants additionally carry a compression level.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// Variant names intentionally keep the upper-case spelling of the thrift `CompressionCodec` enum.
#[allow(non_camel_case_types)]
pub enum Compression {
    /// No compression.
    UNCOMPRESSED,
    /// [Snappy compression](https://en.wikipedia.org/wiki/Snappy_(compression))
    SNAPPY,
    /// [Gzip compression](https://www.ietf.org/rfc/rfc1952.txt), with its compression level.
    GZIP(GzipLevel),
    /// [LZO compression](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer)
    LZO,
    /// [Brotli compression](https://datatracker.ietf.org/doc/html/rfc7932), with its compression level.
    BROTLI(BrotliLevel),
    /// [LZ4 compression](https://lz4.org/), [(deprecated)](https://issues.apache.org/jira/browse/PARQUET-2032)
    LZ4,
    /// [ZSTD compression](https://datatracker.ietf.org/doc/html/rfc8878), with its compression level.
    ZSTD(ZstdLevel),
    /// [LZ4 compression](https://lz4.org/).
    LZ4_RAW,
}
831
832impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for Compression {
833    fn read_thrift(prot: &mut R) -> Result<Self> {
834        let val = prot.read_i32()?;
835        Ok(match val {
836            0 => Self::UNCOMPRESSED,
837            1 => Self::SNAPPY,
838            2 => Self::GZIP(Default::default()),
839            3 => Self::LZO,
840            4 => Self::BROTLI(Default::default()),
841            5 => Self::LZ4,
842            6 => Self::ZSTD(Default::default()),
843            7 => Self::LZ4_RAW,
844            _ => return Err(general_err!("Unexpected CompressionCodec {}", val)),
845        })
846    }
847}
848
849// TODO(ets): explore replacing this with a thrift_enum!(ThriftCompression) for the serialization
850// and then provide `From` impls to convert back and forth. This is necessary due to the addition
851// of compression level to some variants.
852impl WriteThrift for Compression {
853    const ELEMENT_TYPE: ElementType = ElementType::I32;
854
855    fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
856        let id: i32 = match *self {
857            Self::UNCOMPRESSED => 0,
858            Self::SNAPPY => 1,
859            Self::GZIP(_) => 2,
860            Self::LZO => 3,
861            Self::BROTLI(_) => 4,
862            Self::LZ4 => 5,
863            Self::ZSTD(_) => 6,
864            Self::LZ4_RAW => 7,
865        };
866        writer.write_i32(id)
867    }
868}
869
// NOTE(review): presumably expands to the `WriteThriftField` impl for `Compression`, writing it
// as a Thrift `I32` struct field — confirm against the `write_thrift_field!` definition.
write_thrift_field!(Compression, FieldType::I32);
871
872impl Compression {
873    /// Returns the codec type of this compression setting as a string, without the compression
874    /// level.
875    pub(crate) fn codec_to_string(self) -> String {
876        format!("{self:?}").split('(').next().unwrap().to_owned()
877    }
878}
879
880fn split_compression_string(str_setting: &str) -> Result<(&str, Option<u32>), ParquetError> {
881    let split_setting = str_setting.split_once('(');
882
883    match split_setting {
884        Some((codec, level_str)) => {
885            let level = &level_str[..level_str.len() - 1]
886                .parse::<u32>()
887                .map_err(|_| {
888                    ParquetError::General(format!("invalid compression level: {level_str}"))
889                })?;
890            Ok((codec, Some(*level)))
891        }
892        None => Ok((str_setting, None)),
893    }
894}
895
896fn check_level_is_none(level: &Option<u32>) -> Result<(), ParquetError> {
897    if level.is_some() {
898        return Err(ParquetError::General(
899            "compression level is not supported".to_string(),
900        ));
901    }
902
903    Ok(())
904}
905
906fn require_level(codec: &str, level: Option<u32>) -> Result<u32, ParquetError> {
907    level.ok_or(ParquetError::General(format!(
908        "{codec} requires a compression level",
909    )))
910}
911
912impl FromStr for Compression {
913    type Err = ParquetError;
914
915    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
916        let (codec, level) = split_compression_string(s)?;
917
918        let c = match codec {
919            "UNCOMPRESSED" | "uncompressed" => {
920                check_level_is_none(&level)?;
921                Compression::UNCOMPRESSED
922            }
923            "SNAPPY" | "snappy" => {
924                check_level_is_none(&level)?;
925                Compression::SNAPPY
926            }
927            "GZIP" | "gzip" => {
928                let level = require_level(codec, level)?;
929                Compression::GZIP(GzipLevel::try_new(level)?)
930            }
931            "LZO" | "lzo" => {
932                check_level_is_none(&level)?;
933                Compression::LZO
934            }
935            "BROTLI" | "brotli" => {
936                let level = require_level(codec, level)?;
937                Compression::BROTLI(BrotliLevel::try_new(level)?)
938            }
939            "LZ4" | "lz4" => {
940                check_level_is_none(&level)?;
941                Compression::LZ4
942            }
943            "ZSTD" | "zstd" => {
944                let level = require_level(codec, level)?;
945                Compression::ZSTD(ZstdLevel::try_new(level as i32)?)
946            }
947            "LZ4_RAW" | "lz4_raw" => {
948                check_level_is_none(&level)?;
949                Compression::LZ4_RAW
950            }
951            _ => {
952                return Err(ParquetError::General(format!(
953                    "unsupport compression {codec}"
954                )));
955            }
956        };
957
958        Ok(c)
959    }
960}
961
962// ----------------------------------------------------------------------
963// Mirrors thrift enum `PageType`
964
thrift_enum!(
/// Available data pages for Parquet file format.
///
/// Mirrors the Thrift enum `PageType`; the value assigned to each variant is its Thrift id.
///
/// Note that some of the page types may not be supported.
enum PageType {
  DATA_PAGE = 0;
  INDEX_PAGE = 1;
  DICTIONARY_PAGE = 2;
  DATA_PAGE_V2 = 3;
}
);
975
976// ----------------------------------------------------------------------
977// Mirrors thrift enum `BoundaryOrder`
978
thrift_enum!(
/// Enum to annotate whether lists of min/max elements inside ColumnIndex
/// are ordered and if so, in which direction.
///
/// Mirrors the Thrift enum `BoundaryOrder`; the value assigned to each variant is its
/// Thrift id.
enum BoundaryOrder {
  UNORDERED = 0;
  ASCENDING = 1;
  DESCENDING = 2;
}
);
988
989// ----------------------------------------------------------------------
990// Mirrors thrift enum `EdgeInterpolationAlgorithm`
991
// This enum is hand-coded so that it can carry the `_Unknown` variant, which keeps it forward
// compatible with algorithm ids this crate does not know about.
993
/// Edge interpolation algorithm for [`LogicalType::Geography`]
///
/// The explicit discriminants `0..=4` are the ids used when this value is read from and
/// written to Thrift. An id that does not match a known algorithm is kept in
/// [`EdgeInterpolationAlgorithm::_Unknown`] rather than being rejected.
///
/// The default is [`EdgeInterpolationAlgorithm::SPHERICAL`].
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
#[repr(i32)]
#[derive(Default)]
pub enum EdgeInterpolationAlgorithm {
    /// Edges are interpolated as geodesics on a sphere.
    #[default]
    SPHERICAL = 0,
    /// <https://en.wikipedia.org/wiki/Vincenty%27s_formulae>
    VINCENTY = 1,
    /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970
    THOMAS = 2,
    /// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965.
    ANDOYER = 3,
    /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55
    KARNEY = 4,
    /// Unknown algorithm; holds the raw Thrift id so it can be written back unchanged.
    _Unknown(i32),
}
1013
1014impl fmt::Display for EdgeInterpolationAlgorithm {
1015    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1016        f.write_fmt(format_args!("{0:?}", self))
1017    }
1018}
1019
1020impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for EdgeInterpolationAlgorithm {
1021    fn read_thrift(prot: &mut R) -> Result<Self> {
1022        let val = prot.read_i32()?;
1023        match val {
1024            0 => Ok(Self::SPHERICAL),
1025            1 => Ok(Self::VINCENTY),
1026            2 => Ok(Self::THOMAS),
1027            3 => Ok(Self::ANDOYER),
1028            4 => Ok(Self::KARNEY),
1029            _ => Ok(Self::_Unknown(val)),
1030        }
1031    }
1032}
1033
1034impl WriteThrift for EdgeInterpolationAlgorithm {
1035    const ELEMENT_TYPE: ElementType = ElementType::I32;
1036    fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
1037        let val: i32 = match *self {
1038            Self::SPHERICAL => 0,
1039            Self::VINCENTY => 1,
1040            Self::THOMAS => 2,
1041            Self::ANDOYER => 3,
1042            Self::KARNEY => 4,
1043            Self::_Unknown(i) => i,
1044        };
1045        writer.write_i32(val)
1046    }
1047}
1048
// NOTE(review): presumably expands to the `WriteThriftField` impl for
// `EdgeInterpolationAlgorithm`, writing it as a Thrift `I32` struct field — confirm against
// the `write_thrift_field!` definition.
write_thrift_field!(EdgeInterpolationAlgorithm, FieldType::I32);
1050
1051// ----------------------------------------------------------------------
1052// Mirrors thrift union `BloomFilterAlgorithm`
1053
thrift_union_all_empty!(
/// The algorithm used in Bloom filter.
///
/// Mirrors the Thrift union `BloomFilterAlgorithm`, which currently has a single member.
union BloomFilterAlgorithm {
  /// Block-based Bloom filter.
  1: SplitBlockAlgorithm BLOCK;
}
);
1061
1062// ----------------------------------------------------------------------
1063// Mirrors thrift union `BloomFilterHash`
1064
thrift_union_all_empty!(
/// The hash function used in Bloom filter. This function takes the hash of a column value
/// using plain encoding.
///
/// Mirrors the Thrift union `BloomFilterHash`, which currently has a single member.
union BloomFilterHash {
  /// xxHash Strategy.
  1: XxHash XXHASH;
}
);
1073
1074// ----------------------------------------------------------------------
1075// Mirrors thrift union `BloomFilterCompression`
1076
thrift_union_all_empty!(
/// The compression used in the Bloom filter.
///
/// Mirrors the Thrift union `BloomFilterCompression`, which currently has a single member.
union BloomFilterCompression {
  /// No compression.
  1: Uncompressed UNCOMPRESSED;
}
);
1083
1084// ----------------------------------------------------------------------
1085// Mirrors thrift union `ColumnOrder`
1086
/// Sort order for page and column statistics.
///
/// Types are associated with sort orders and column stats are aggregated using a sort
/// order, and a sort order should be considered when comparing values with statistics
/// min/max.
///
/// The sort order for a given type combination is computed by
/// [`ColumnOrder::sort_order_for_type`].
///
/// See reference in
/// <https://github.com/apache/arrow/blob/main/cpp/src/parquet/types.h>
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum SortOrder {
    /// Signed (either value or legacy byte-wise) comparison.
    SIGNED,
    /// Unsigned (depending on physical type either value or byte-wise) comparison.
    UNSIGNED,
    /// Comparison is undefined.
    UNDEFINED,
}
1105
1106impl SortOrder {
1107    /// Returns true if this is [`Self::SIGNED`]
1108    pub fn is_signed(&self) -> bool {
1109        matches!(self, Self::SIGNED)
1110    }
1111}
1112
/// Column order that specifies what method was used to aggregate min/max values for
/// statistics.
///
/// If column order is undefined, then it is the legacy behaviour and all values should
/// be compared as signed values/bytes.
///
/// Use [`ColumnOrder::sort_order`] to obtain the [`SortOrder`] implied by a value. Only
/// [`ColumnOrder::TYPE_DEFINED_ORDER`] can be written to Thrift; writing the other variants
/// is an error.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum ColumnOrder {
    /// Column uses the order defined by its logical or physical type
    /// (if there is no logical type), parquet-format 2.4.0+.
    ///
    /// When read from Thrift the contained [`SortOrder`] is a `SIGNED` placeholder that
    /// must be set correctly after parsing.
    TYPE_DEFINED_ORDER(SortOrder),
    // The following are not defined in the Parquet spec and should always be last.
    /// Undefined column order, means legacy behaviour before parquet-format 2.4.0.
    /// Sort order is always SIGNED.
    UNDEFINED,
    /// An unknown but present ColumnOrder. Statistics with an unknown `ColumnOrder`
    /// will be ignored.
    UNKNOWN,
}
1132
1133impl ColumnOrder {
1134    /// Returns sort order for a physical/logical type.
1135    #[deprecated(
1136        since = "57.1.0",
1137        note = "use `ColumnOrder::sort_order_for_type` instead"
1138    )]
1139    pub fn get_sort_order(
1140        logical_type: Option<LogicalType>,
1141        converted_type: ConvertedType,
1142        physical_type: Type,
1143    ) -> SortOrder {
1144        Self::sort_order_for_type(logical_type.as_ref(), converted_type, physical_type)
1145    }
1146
1147    /// Returns sort order for a physical/logical type.
1148    pub fn sort_order_for_type(
1149        logical_type: Option<&LogicalType>,
1150        converted_type: ConvertedType,
1151        physical_type: Type,
1152    ) -> SortOrder {
1153        match logical_type {
1154            Some(logical) => match logical {
1155                LogicalType::String | LogicalType::Enum | LogicalType::Json | LogicalType::Bson => {
1156                    SortOrder::UNSIGNED
1157                }
1158                LogicalType::Integer { is_signed, .. } => match is_signed {
1159                    true => SortOrder::SIGNED,
1160                    false => SortOrder::UNSIGNED,
1161                },
1162                LogicalType::Map | LogicalType::List => SortOrder::UNDEFINED,
1163                LogicalType::Decimal { .. } => SortOrder::SIGNED,
1164                LogicalType::Date => SortOrder::SIGNED,
1165                LogicalType::Time { .. } => SortOrder::SIGNED,
1166                LogicalType::Timestamp { .. } => SortOrder::SIGNED,
1167                LogicalType::Unknown => SortOrder::UNDEFINED,
1168                LogicalType::Uuid => SortOrder::UNSIGNED,
1169                LogicalType::Float16 => SortOrder::SIGNED,
1170                LogicalType::Variant { .. }
1171                | LogicalType::Geometry { .. }
1172                | LogicalType::Geography { .. }
1173                | LogicalType::_Unknown { .. } => SortOrder::UNDEFINED,
1174            },
1175            // Fall back to converted type
1176            None => Self::get_converted_sort_order(converted_type, physical_type),
1177        }
1178    }
1179
1180    fn get_converted_sort_order(converted_type: ConvertedType, physical_type: Type) -> SortOrder {
1181        match converted_type {
1182            // Unsigned byte-wise comparison.
1183            ConvertedType::UTF8
1184            | ConvertedType::JSON
1185            | ConvertedType::BSON
1186            | ConvertedType::ENUM => SortOrder::UNSIGNED,
1187
1188            ConvertedType::INT_8
1189            | ConvertedType::INT_16
1190            | ConvertedType::INT_32
1191            | ConvertedType::INT_64 => SortOrder::SIGNED,
1192
1193            ConvertedType::UINT_8
1194            | ConvertedType::UINT_16
1195            | ConvertedType::UINT_32
1196            | ConvertedType::UINT_64 => SortOrder::UNSIGNED,
1197
1198            // Signed comparison of the represented value.
1199            ConvertedType::DECIMAL => SortOrder::SIGNED,
1200
1201            ConvertedType::DATE => SortOrder::SIGNED,
1202
1203            ConvertedType::TIME_MILLIS
1204            | ConvertedType::TIME_MICROS
1205            | ConvertedType::TIMESTAMP_MILLIS
1206            | ConvertedType::TIMESTAMP_MICROS => SortOrder::SIGNED,
1207
1208            ConvertedType::INTERVAL => SortOrder::UNDEFINED,
1209
1210            ConvertedType::LIST | ConvertedType::MAP | ConvertedType::MAP_KEY_VALUE => {
1211                SortOrder::UNDEFINED
1212            }
1213
1214            // Fall back to physical type.
1215            ConvertedType::NONE => Self::get_default_sort_order(physical_type),
1216        }
1217    }
1218
1219    /// Returns default sort order based on physical type.
1220    fn get_default_sort_order(physical_type: Type) -> SortOrder {
1221        match physical_type {
1222            // Order: false, true
1223            Type::BOOLEAN => SortOrder::UNSIGNED,
1224            Type::INT32 | Type::INT64 => SortOrder::SIGNED,
1225            Type::INT96 => SortOrder::UNDEFINED,
1226            // Notes to remember when comparing float/double values:
1227            // If the min is a NaN, it should be ignored.
1228            // If the max is a NaN, it should be ignored.
1229            // If the min is +0, the row group may contain -0 values as well.
1230            // If the max is -0, the row group may contain +0 values as well.
1231            // When looking for NaN values, min and max should be ignored.
1232            Type::FLOAT | Type::DOUBLE => SortOrder::SIGNED,
1233            // Unsigned byte-wise comparison
1234            Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => SortOrder::UNSIGNED,
1235        }
1236    }
1237
1238    /// Returns sort order associated with this column order.
1239    pub fn sort_order(&self) -> SortOrder {
1240        match *self {
1241            ColumnOrder::TYPE_DEFINED_ORDER(order) => order,
1242            ColumnOrder::UNDEFINED => SortOrder::SIGNED,
1243            ColumnOrder::UNKNOWN => SortOrder::UNDEFINED,
1244        }
1245    }
1246}
1247
1248impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ColumnOrder {
1249    fn read_thrift(prot: &mut R) -> Result<Self> {
1250        let field_ident = prot.read_field_begin(0)?;
1251        if field_ident.field_type == FieldType::Stop {
1252            return Err(general_err!("Received empty union from remote ColumnOrder"));
1253        }
1254        let ret = match field_ident.id {
1255            1 => {
1256                // NOTE: the sort order needs to be set correctly after parsing.
1257                prot.skip_empty_struct()?;
1258                Self::TYPE_DEFINED_ORDER(SortOrder::SIGNED)
1259            }
1260            _ => {
1261                prot.skip(field_ident.field_type)?;
1262                Self::UNKNOWN
1263            }
1264        };
1265        let field_ident = prot.read_field_begin(field_ident.id)?;
1266        if field_ident.field_type != FieldType::Stop {
1267            return Err(general_err!(
1268                "Received multiple fields for union from remote ColumnOrder"
1269            ));
1270        }
1271        Ok(ret)
1272    }
1273}
1274
1275impl WriteThrift for ColumnOrder {
1276    const ELEMENT_TYPE: ElementType = ElementType::Struct;
1277
1278    fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
1279        match *self {
1280            Self::TYPE_DEFINED_ORDER(_) => {
1281                writer.write_field_begin(FieldType::Struct, 1, 0)?;
1282                writer.write_struct_end()?;
1283            }
1284            _ => return Err(general_err!("Attempt to write undefined ColumnOrder")),
1285        }
1286        // write end of struct for this union
1287        writer.write_struct_end()
1288    }
1289}
1290
1291// ----------------------------------------------------------------------
1292// Display handlers
1293
1294impl fmt::Display for Compression {
1295    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1296        write!(f, "{self:?}")
1297    }
1298}
1299
1300impl fmt::Display for SortOrder {
1301    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1302        write!(f, "{self:?}")
1303    }
1304}
1305
1306impl fmt::Display for ColumnOrder {
1307    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1308        write!(f, "{self:?}")
1309    }
1310}
1311
1312// ----------------------------------------------------------------------
1313// LogicalType <=> ConvertedType conversion
1314
1315// Note: To prevent type loss when converting from ConvertedType to LogicalType,
1316// the conversion from ConvertedType -> LogicalType is not implemented.
1317// Such type loss includes:
1318// - Not knowing the decimal scale and precision of ConvertedType
1319// - Time and timestamp nanosecond precision, that is not supported in ConvertedType.
1320
1321impl From<Option<LogicalType>> for ConvertedType {
1322    fn from(value: Option<LogicalType>) -> Self {
1323        match value {
1324            Some(value) => match value {
1325                LogicalType::String => ConvertedType::UTF8,
1326                LogicalType::Map => ConvertedType::MAP,
1327                LogicalType::List => ConvertedType::LIST,
1328                LogicalType::Enum => ConvertedType::ENUM,
1329                LogicalType::Decimal { .. } => ConvertedType::DECIMAL,
1330                LogicalType::Date => ConvertedType::DATE,
1331                LogicalType::Time { unit, .. } => match unit {
1332                    TimeUnit::MILLIS => ConvertedType::TIME_MILLIS,
1333                    TimeUnit::MICROS => ConvertedType::TIME_MICROS,
1334                    TimeUnit::NANOS => ConvertedType::NONE,
1335                },
1336                LogicalType::Timestamp { unit, .. } => match unit {
1337                    TimeUnit::MILLIS => ConvertedType::TIMESTAMP_MILLIS,
1338                    TimeUnit::MICROS => ConvertedType::TIMESTAMP_MICROS,
1339                    TimeUnit::NANOS => ConvertedType::NONE,
1340                },
1341                LogicalType::Integer {
1342                    bit_width,
1343                    is_signed,
1344                } => match (bit_width, is_signed) {
1345                    (8, true) => ConvertedType::INT_8,
1346                    (16, true) => ConvertedType::INT_16,
1347                    (32, true) => ConvertedType::INT_32,
1348                    (64, true) => ConvertedType::INT_64,
1349                    (8, false) => ConvertedType::UINT_8,
1350                    (16, false) => ConvertedType::UINT_16,
1351                    (32, false) => ConvertedType::UINT_32,
1352                    (64, false) => ConvertedType::UINT_64,
1353                    (bit_width, is_signed) => panic!(
1354                        "Integer type bit_width={bit_width}, signed={is_signed} is not supported"
1355                    ),
1356                },
1357                LogicalType::Json => ConvertedType::JSON,
1358                LogicalType::Bson => ConvertedType::BSON,
1359                LogicalType::Uuid
1360                | LogicalType::Float16
1361                | LogicalType::Variant { .. }
1362                | LogicalType::Geometry { .. }
1363                | LogicalType::Geography { .. }
1364                | LogicalType::_Unknown { .. }
1365                | LogicalType::Unknown => ConvertedType::NONE,
1366            },
1367            None => ConvertedType::NONE,
1368        }
1369    }
1370}
1371
1372// ----------------------------------------------------------------------
1373// String conversions for schema parsing.
1374
1375impl str::FromStr for Repetition {
1376    type Err = ParquetError;
1377
1378    fn from_str(s: &str) -> Result<Self> {
1379        match s {
1380            "REQUIRED" => Ok(Repetition::REQUIRED),
1381            "OPTIONAL" => Ok(Repetition::OPTIONAL),
1382            "REPEATED" => Ok(Repetition::REPEATED),
1383            other => Err(general_err!("Invalid parquet repetition {}", other)),
1384        }
1385    }
1386}
1387
1388impl str::FromStr for Type {
1389    type Err = ParquetError;
1390
1391    fn from_str(s: &str) -> Result<Self> {
1392        match s {
1393            "BOOLEAN" => Ok(Type::BOOLEAN),
1394            "INT32" => Ok(Type::INT32),
1395            "INT64" => Ok(Type::INT64),
1396            "INT96" => Ok(Type::INT96),
1397            "FLOAT" => Ok(Type::FLOAT),
1398            "DOUBLE" => Ok(Type::DOUBLE),
1399            "BYTE_ARRAY" | "BINARY" => Ok(Type::BYTE_ARRAY),
1400            "FIXED_LEN_BYTE_ARRAY" => Ok(Type::FIXED_LEN_BYTE_ARRAY),
1401            other => Err(general_err!("Invalid parquet type {}", other)),
1402        }
1403    }
1404}
1405
1406impl str::FromStr for ConvertedType {
1407    type Err = ParquetError;
1408
1409    fn from_str(s: &str) -> Result<Self> {
1410        match s {
1411            "NONE" => Ok(ConvertedType::NONE),
1412            "UTF8" => Ok(ConvertedType::UTF8),
1413            "MAP" => Ok(ConvertedType::MAP),
1414            "MAP_KEY_VALUE" => Ok(ConvertedType::MAP_KEY_VALUE),
1415            "LIST" => Ok(ConvertedType::LIST),
1416            "ENUM" => Ok(ConvertedType::ENUM),
1417            "DECIMAL" => Ok(ConvertedType::DECIMAL),
1418            "DATE" => Ok(ConvertedType::DATE),
1419            "TIME_MILLIS" => Ok(ConvertedType::TIME_MILLIS),
1420            "TIME_MICROS" => Ok(ConvertedType::TIME_MICROS),
1421            "TIMESTAMP_MILLIS" => Ok(ConvertedType::TIMESTAMP_MILLIS),
1422            "TIMESTAMP_MICROS" => Ok(ConvertedType::TIMESTAMP_MICROS),
1423            "UINT_8" => Ok(ConvertedType::UINT_8),
1424            "UINT_16" => Ok(ConvertedType::UINT_16),
1425            "UINT_32" => Ok(ConvertedType::UINT_32),
1426            "UINT_64" => Ok(ConvertedType::UINT_64),
1427            "INT_8" => Ok(ConvertedType::INT_8),
1428            "INT_16" => Ok(ConvertedType::INT_16),
1429            "INT_32" => Ok(ConvertedType::INT_32),
1430            "INT_64" => Ok(ConvertedType::INT_64),
1431            "JSON" => Ok(ConvertedType::JSON),
1432            "BSON" => Ok(ConvertedType::BSON),
1433            "INTERVAL" => Ok(ConvertedType::INTERVAL),
1434            other => Err(general_err!("Invalid parquet converted type {}", other)),
1435        }
1436    }
1437}
1438
1439impl str::FromStr for LogicalType {
1440    type Err = ParquetError;
1441
1442    fn from_str(s: &str) -> Result<Self> {
1443        match s {
1444            // The type is a placeholder that gets updated elsewhere
1445            "INTEGER" => Ok(LogicalType::Integer {
1446                bit_width: 8,
1447                is_signed: false,
1448            }),
1449            "MAP" => Ok(LogicalType::Map),
1450            "LIST" => Ok(LogicalType::List),
1451            "ENUM" => Ok(LogicalType::Enum),
1452            "DECIMAL" => Ok(LogicalType::Decimal {
1453                precision: -1,
1454                scale: -1,
1455            }),
1456            "DATE" => Ok(LogicalType::Date),
1457            "TIME" => Ok(LogicalType::Time {
1458                is_adjusted_to_u_t_c: false,
1459                unit: TimeUnit::MILLIS,
1460            }),
1461            "TIMESTAMP" => Ok(LogicalType::Timestamp {
1462                is_adjusted_to_u_t_c: false,
1463                unit: TimeUnit::MILLIS,
1464            }),
1465            "STRING" => Ok(LogicalType::String),
1466            "JSON" => Ok(LogicalType::Json),
1467            "BSON" => Ok(LogicalType::Bson),
1468            "UUID" => Ok(LogicalType::Uuid),
1469            "UNKNOWN" => Ok(LogicalType::Unknown),
1470            "INTERVAL" => Err(general_err!(
1471                "Interval parquet logical type not yet supported"
1472            )),
1473            "FLOAT16" => Ok(LogicalType::Float16),
1474            "GEOMETRY" => Ok(LogicalType::Geometry { crs: None }),
1475            "GEOGRAPHY" => Ok(LogicalType::Geography {
1476                crs: None,
1477                algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
1478            }),
1479            other => Err(general_err!("Invalid parquet logical type {}", other)),
1480        }
1481    }
1482}
1483
1484#[cfg(test)]
1485#[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module
1486mod tests {
1487    use super::*;
1488    use crate::parquet_thrift::{ThriftSliceInputProtocol, tests::test_roundtrip};
1489
1490    #[test]
1491    fn test_display_type() {
1492        assert_eq!(Type::BOOLEAN.to_string(), "BOOLEAN");
1493        assert_eq!(Type::INT32.to_string(), "INT32");
1494        assert_eq!(Type::INT64.to_string(), "INT64");
1495        assert_eq!(Type::INT96.to_string(), "INT96");
1496        assert_eq!(Type::FLOAT.to_string(), "FLOAT");
1497        assert_eq!(Type::DOUBLE.to_string(), "DOUBLE");
1498        assert_eq!(Type::BYTE_ARRAY.to_string(), "BYTE_ARRAY");
1499        assert_eq!(
1500            Type::FIXED_LEN_BYTE_ARRAY.to_string(),
1501            "FIXED_LEN_BYTE_ARRAY"
1502        );
1503    }
1504
1505    #[test]
1506    fn test_from_string_into_type() {
1507        assert_eq!(
1508            Type::BOOLEAN.to_string().parse::<Type>().unwrap(),
1509            Type::BOOLEAN
1510        );
1511        assert_eq!(
1512            Type::INT32.to_string().parse::<Type>().unwrap(),
1513            Type::INT32
1514        );
1515        assert_eq!(
1516            Type::INT64.to_string().parse::<Type>().unwrap(),
1517            Type::INT64
1518        );
1519        assert_eq!(
1520            Type::INT96.to_string().parse::<Type>().unwrap(),
1521            Type::INT96
1522        );
1523        assert_eq!(
1524            Type::FLOAT.to_string().parse::<Type>().unwrap(),
1525            Type::FLOAT
1526        );
1527        assert_eq!(
1528            Type::DOUBLE.to_string().parse::<Type>().unwrap(),
1529            Type::DOUBLE
1530        );
1531        assert_eq!(
1532            Type::BYTE_ARRAY.to_string().parse::<Type>().unwrap(),
1533            Type::BYTE_ARRAY
1534        );
1535        assert_eq!("BINARY".parse::<Type>().unwrap(), Type::BYTE_ARRAY);
1536        assert_eq!(
1537            Type::FIXED_LEN_BYTE_ARRAY
1538                .to_string()
1539                .parse::<Type>()
1540                .unwrap(),
1541            Type::FIXED_LEN_BYTE_ARRAY
1542        );
1543    }
1544
1545    #[test]
1546    fn test_converted_type_roundtrip() {
1547        test_roundtrip(ConvertedType::UTF8);
1548        test_roundtrip(ConvertedType::MAP);
1549        test_roundtrip(ConvertedType::MAP_KEY_VALUE);
1550        test_roundtrip(ConvertedType::LIST);
1551        test_roundtrip(ConvertedType::ENUM);
1552        test_roundtrip(ConvertedType::DECIMAL);
1553        test_roundtrip(ConvertedType::DATE);
1554        test_roundtrip(ConvertedType::TIME_MILLIS);
1555        test_roundtrip(ConvertedType::TIME_MICROS);
1556        test_roundtrip(ConvertedType::TIMESTAMP_MILLIS);
1557        test_roundtrip(ConvertedType::TIMESTAMP_MICROS);
1558        test_roundtrip(ConvertedType::UINT_8);
1559        test_roundtrip(ConvertedType::UINT_16);
1560        test_roundtrip(ConvertedType::UINT_32);
1561        test_roundtrip(ConvertedType::UINT_64);
1562        test_roundtrip(ConvertedType::INT_8);
1563        test_roundtrip(ConvertedType::INT_16);
1564        test_roundtrip(ConvertedType::INT_32);
1565        test_roundtrip(ConvertedType::INT_64);
1566        test_roundtrip(ConvertedType::JSON);
1567        test_roundtrip(ConvertedType::BSON);
1568        test_roundtrip(ConvertedType::INTERVAL);
1569    }
1570
1571    #[test]
1572    fn test_read_invalid_converted_type() {
1573        let mut prot = ThriftSliceInputProtocol::new(&[0x7eu8]);
1574        let res = ConvertedType::read_thrift(&mut prot);
1575        assert!(res.is_err());
1576        assert_eq!(
1577            res.unwrap_err().to_string(),
1578            "Parquet error: Unexpected ConvertedType 63"
1579        );
1580    }
1581
1582    #[test]
1583    fn test_display_converted_type() {
1584        assert_eq!(ConvertedType::NONE.to_string(), "NONE");
1585        assert_eq!(ConvertedType::UTF8.to_string(), "UTF8");
1586        assert_eq!(ConvertedType::MAP.to_string(), "MAP");
1587        assert_eq!(ConvertedType::MAP_KEY_VALUE.to_string(), "MAP_KEY_VALUE");
1588        assert_eq!(ConvertedType::LIST.to_string(), "LIST");
1589        assert_eq!(ConvertedType::ENUM.to_string(), "ENUM");
1590        assert_eq!(ConvertedType::DECIMAL.to_string(), "DECIMAL");
1591        assert_eq!(ConvertedType::DATE.to_string(), "DATE");
1592        assert_eq!(ConvertedType::TIME_MILLIS.to_string(), "TIME_MILLIS");
1593        assert_eq!(ConvertedType::DATE.to_string(), "DATE");
1594        assert_eq!(ConvertedType::TIME_MICROS.to_string(), "TIME_MICROS");
1595        assert_eq!(
1596            ConvertedType::TIMESTAMP_MILLIS.to_string(),
1597            "TIMESTAMP_MILLIS"
1598        );
1599        assert_eq!(
1600            ConvertedType::TIMESTAMP_MICROS.to_string(),
1601            "TIMESTAMP_MICROS"
1602        );
1603        assert_eq!(ConvertedType::UINT_8.to_string(), "UINT_8");
1604        assert_eq!(ConvertedType::UINT_16.to_string(), "UINT_16");
1605        assert_eq!(ConvertedType::UINT_32.to_string(), "UINT_32");
1606        assert_eq!(ConvertedType::UINT_64.to_string(), "UINT_64");
1607        assert_eq!(ConvertedType::INT_8.to_string(), "INT_8");
1608        assert_eq!(ConvertedType::INT_16.to_string(), "INT_16");
1609        assert_eq!(ConvertedType::INT_32.to_string(), "INT_32");
1610        assert_eq!(ConvertedType::INT_64.to_string(), "INT_64");
1611        assert_eq!(ConvertedType::JSON.to_string(), "JSON");
1612        assert_eq!(ConvertedType::BSON.to_string(), "BSON");
1613        assert_eq!(ConvertedType::INTERVAL.to_string(), "INTERVAL");
1614        assert_eq!(ConvertedType::DECIMAL.to_string(), "DECIMAL")
1615    }
1616
1617    #[test]
1618    fn test_from_string_into_converted_type() {
1619        assert_eq!(
1620            ConvertedType::NONE
1621                .to_string()
1622                .parse::<ConvertedType>()
1623                .unwrap(),
1624            ConvertedType::NONE
1625        );
1626        assert_eq!(
1627            ConvertedType::UTF8
1628                .to_string()
1629                .parse::<ConvertedType>()
1630                .unwrap(),
1631            ConvertedType::UTF8
1632        );
1633        assert_eq!(
1634            ConvertedType::MAP
1635                .to_string()
1636                .parse::<ConvertedType>()
1637                .unwrap(),
1638            ConvertedType::MAP
1639        );
1640        assert_eq!(
1641            ConvertedType::MAP_KEY_VALUE
1642                .to_string()
1643                .parse::<ConvertedType>()
1644                .unwrap(),
1645            ConvertedType::MAP_KEY_VALUE
1646        );
1647        assert_eq!(
1648            ConvertedType::LIST
1649                .to_string()
1650                .parse::<ConvertedType>()
1651                .unwrap(),
1652            ConvertedType::LIST
1653        );
1654        assert_eq!(
1655            ConvertedType::ENUM
1656                .to_string()
1657                .parse::<ConvertedType>()
1658                .unwrap(),
1659            ConvertedType::ENUM
1660        );
1661        assert_eq!(
1662            ConvertedType::DECIMAL
1663                .to_string()
1664                .parse::<ConvertedType>()
1665                .unwrap(),
1666            ConvertedType::DECIMAL
1667        );
1668        assert_eq!(
1669            ConvertedType::DATE
1670                .to_string()
1671                .parse::<ConvertedType>()
1672                .unwrap(),
1673            ConvertedType::DATE
1674        );
1675        assert_eq!(
1676            ConvertedType::TIME_MILLIS
1677                .to_string()
1678                .parse::<ConvertedType>()
1679                .unwrap(),
1680            ConvertedType::TIME_MILLIS
1681        );
1682        assert_eq!(
1683            ConvertedType::TIME_MICROS
1684                .to_string()
1685                .parse::<ConvertedType>()
1686                .unwrap(),
1687            ConvertedType::TIME_MICROS
1688        );
1689        assert_eq!(
1690            ConvertedType::TIMESTAMP_MILLIS
1691                .to_string()
1692                .parse::<ConvertedType>()
1693                .unwrap(),
1694            ConvertedType::TIMESTAMP_MILLIS
1695        );
1696        assert_eq!(
1697            ConvertedType::TIMESTAMP_MICROS
1698                .to_string()
1699                .parse::<ConvertedType>()
1700                .unwrap(),
1701            ConvertedType::TIMESTAMP_MICROS
1702        );
1703        assert_eq!(
1704            ConvertedType::UINT_8
1705                .to_string()
1706                .parse::<ConvertedType>()
1707                .unwrap(),
1708            ConvertedType::UINT_8
1709        );
1710        assert_eq!(
1711            ConvertedType::UINT_16
1712                .to_string()
1713                .parse::<ConvertedType>()
1714                .unwrap(),
1715            ConvertedType::UINT_16
1716        );
1717        assert_eq!(
1718            ConvertedType::UINT_32
1719                .to_string()
1720                .parse::<ConvertedType>()
1721                .unwrap(),
1722            ConvertedType::UINT_32
1723        );
1724        assert_eq!(
1725            ConvertedType::UINT_64
1726                .to_string()
1727                .parse::<ConvertedType>()
1728                .unwrap(),
1729            ConvertedType::UINT_64
1730        );
1731        assert_eq!(
1732            ConvertedType::INT_8
1733                .to_string()
1734                .parse::<ConvertedType>()
1735                .unwrap(),
1736            ConvertedType::INT_8
1737        );
1738        assert_eq!(
1739            ConvertedType::INT_16
1740                .to_string()
1741                .parse::<ConvertedType>()
1742                .unwrap(),
1743            ConvertedType::INT_16
1744        );
1745        assert_eq!(
1746            ConvertedType::INT_32
1747                .to_string()
1748                .parse::<ConvertedType>()
1749                .unwrap(),
1750            ConvertedType::INT_32
1751        );
1752        assert_eq!(
1753            ConvertedType::INT_64
1754                .to_string()
1755                .parse::<ConvertedType>()
1756                .unwrap(),
1757            ConvertedType::INT_64
1758        );
1759        assert_eq!(
1760            ConvertedType::JSON
1761                .to_string()
1762                .parse::<ConvertedType>()
1763                .unwrap(),
1764            ConvertedType::JSON
1765        );
1766        assert_eq!(
1767            ConvertedType::BSON
1768                .to_string()
1769                .parse::<ConvertedType>()
1770                .unwrap(),
1771            ConvertedType::BSON
1772        );
1773        assert_eq!(
1774            ConvertedType::INTERVAL
1775                .to_string()
1776                .parse::<ConvertedType>()
1777                .unwrap(),
1778            ConvertedType::INTERVAL
1779        );
1780        assert_eq!(
1781            ConvertedType::DECIMAL
1782                .to_string()
1783                .parse::<ConvertedType>()
1784                .unwrap(),
1785            ConvertedType::DECIMAL
1786        )
1787    }
1788
1789    #[test]
1790    fn test_logical_to_converted_type() {
1791        let logical_none: Option<LogicalType> = None;
1792        assert_eq!(ConvertedType::from(logical_none), ConvertedType::NONE);
1793        assert_eq!(
1794            ConvertedType::from(Some(LogicalType::Decimal {
1795                precision: 20,
1796                scale: 5
1797            })),
1798            ConvertedType::DECIMAL
1799        );
1800        assert_eq!(
1801            ConvertedType::from(Some(LogicalType::Bson)),
1802            ConvertedType::BSON
1803        );
1804        assert_eq!(
1805            ConvertedType::from(Some(LogicalType::Json)),
1806            ConvertedType::JSON
1807        );
1808        assert_eq!(
1809            ConvertedType::from(Some(LogicalType::String)),
1810            ConvertedType::UTF8
1811        );
1812        assert_eq!(
1813            ConvertedType::from(Some(LogicalType::Date)),
1814            ConvertedType::DATE
1815        );
1816        assert_eq!(
1817            ConvertedType::from(Some(LogicalType::Time {
1818                unit: TimeUnit::MILLIS,
1819                is_adjusted_to_u_t_c: true,
1820            })),
1821            ConvertedType::TIME_MILLIS
1822        );
1823        assert_eq!(
1824            ConvertedType::from(Some(LogicalType::Time {
1825                unit: TimeUnit::MICROS,
1826                is_adjusted_to_u_t_c: true,
1827            })),
1828            ConvertedType::TIME_MICROS
1829        );
1830        assert_eq!(
1831            ConvertedType::from(Some(LogicalType::Time {
1832                unit: TimeUnit::NANOS,
1833                is_adjusted_to_u_t_c: false,
1834            })),
1835            ConvertedType::NONE
1836        );
1837        assert_eq!(
1838            ConvertedType::from(Some(LogicalType::Timestamp {
1839                unit: TimeUnit::MILLIS,
1840                is_adjusted_to_u_t_c: true,
1841            })),
1842            ConvertedType::TIMESTAMP_MILLIS
1843        );
1844        assert_eq!(
1845            ConvertedType::from(Some(LogicalType::Timestamp {
1846                unit: TimeUnit::MICROS,
1847                is_adjusted_to_u_t_c: false,
1848            })),
1849            ConvertedType::TIMESTAMP_MICROS
1850        );
1851        assert_eq!(
1852            ConvertedType::from(Some(LogicalType::Timestamp {
1853                unit: TimeUnit::NANOS,
1854                is_adjusted_to_u_t_c: false,
1855            })),
1856            ConvertedType::NONE
1857        );
1858        assert_eq!(
1859            ConvertedType::from(Some(LogicalType::Integer {
1860                bit_width: 8,
1861                is_signed: false
1862            })),
1863            ConvertedType::UINT_8
1864        );
1865        assert_eq!(
1866            ConvertedType::from(Some(LogicalType::Integer {
1867                bit_width: 8,
1868                is_signed: true
1869            })),
1870            ConvertedType::INT_8
1871        );
1872        assert_eq!(
1873            ConvertedType::from(Some(LogicalType::Integer {
1874                bit_width: 16,
1875                is_signed: false
1876            })),
1877            ConvertedType::UINT_16
1878        );
1879        assert_eq!(
1880            ConvertedType::from(Some(LogicalType::Integer {
1881                bit_width: 16,
1882                is_signed: true
1883            })),
1884            ConvertedType::INT_16
1885        );
1886        assert_eq!(
1887            ConvertedType::from(Some(LogicalType::Integer {
1888                bit_width: 32,
1889                is_signed: false
1890            })),
1891            ConvertedType::UINT_32
1892        );
1893        assert_eq!(
1894            ConvertedType::from(Some(LogicalType::Integer {
1895                bit_width: 32,
1896                is_signed: true
1897            })),
1898            ConvertedType::INT_32
1899        );
1900        assert_eq!(
1901            ConvertedType::from(Some(LogicalType::Integer {
1902                bit_width: 64,
1903                is_signed: false
1904            })),
1905            ConvertedType::UINT_64
1906        );
1907        assert_eq!(
1908            ConvertedType::from(Some(LogicalType::Integer {
1909                bit_width: 64,
1910                is_signed: true
1911            })),
1912            ConvertedType::INT_64
1913        );
1914        assert_eq!(
1915            ConvertedType::from(Some(LogicalType::List)),
1916            ConvertedType::LIST
1917        );
1918        assert_eq!(
1919            ConvertedType::from(Some(LogicalType::Map)),
1920            ConvertedType::MAP
1921        );
1922        assert_eq!(
1923            ConvertedType::from(Some(LogicalType::Uuid)),
1924            ConvertedType::NONE
1925        );
1926        assert_eq!(
1927            ConvertedType::from(Some(LogicalType::Enum)),
1928            ConvertedType::ENUM
1929        );
1930        assert_eq!(
1931            ConvertedType::from(Some(LogicalType::Float16)),
1932            ConvertedType::NONE
1933        );
1934        assert_eq!(
1935            ConvertedType::from(Some(LogicalType::Geometry { crs: None })),
1936            ConvertedType::NONE
1937        );
1938        assert_eq!(
1939            ConvertedType::from(Some(LogicalType::Geography {
1940                crs: None,
1941                algorithm: Some(EdgeInterpolationAlgorithm::default()),
1942            })),
1943            ConvertedType::NONE
1944        );
1945        assert_eq!(
1946            ConvertedType::from(Some(LogicalType::Unknown)),
1947            ConvertedType::NONE
1948        );
1949    }
1950
1951    #[test]
1952    fn test_logical_type_roundtrip() {
1953        test_roundtrip(LogicalType::String);
1954        test_roundtrip(LogicalType::Map);
1955        test_roundtrip(LogicalType::List);
1956        test_roundtrip(LogicalType::Enum);
1957        test_roundtrip(LogicalType::Decimal {
1958            scale: 0,
1959            precision: 20,
1960        });
1961        test_roundtrip(LogicalType::Date);
1962        test_roundtrip(LogicalType::Time {
1963            is_adjusted_to_u_t_c: true,
1964            unit: TimeUnit::MICROS,
1965        });
1966        test_roundtrip(LogicalType::Time {
1967            is_adjusted_to_u_t_c: false,
1968            unit: TimeUnit::MILLIS,
1969        });
1970        test_roundtrip(LogicalType::Time {
1971            is_adjusted_to_u_t_c: false,
1972            unit: TimeUnit::NANOS,
1973        });
1974        test_roundtrip(LogicalType::Timestamp {
1975            is_adjusted_to_u_t_c: false,
1976            unit: TimeUnit::MICROS,
1977        });
1978        test_roundtrip(LogicalType::Timestamp {
1979            is_adjusted_to_u_t_c: true,
1980            unit: TimeUnit::MILLIS,
1981        });
1982        test_roundtrip(LogicalType::Timestamp {
1983            is_adjusted_to_u_t_c: true,
1984            unit: TimeUnit::NANOS,
1985        });
1986        test_roundtrip(LogicalType::Integer {
1987            bit_width: 8,
1988            is_signed: true,
1989        });
1990        test_roundtrip(LogicalType::Integer {
1991            bit_width: 16,
1992            is_signed: false,
1993        });
1994        test_roundtrip(LogicalType::Integer {
1995            bit_width: 32,
1996            is_signed: true,
1997        });
1998        test_roundtrip(LogicalType::Integer {
1999            bit_width: 64,
2000            is_signed: false,
2001        });
2002        test_roundtrip(LogicalType::Json);
2003        test_roundtrip(LogicalType::Bson);
2004        test_roundtrip(LogicalType::Uuid);
2005        test_roundtrip(LogicalType::Float16);
2006        test_roundtrip(LogicalType::Variant {
2007            specification_version: Some(1),
2008        });
2009        test_roundtrip(LogicalType::Variant {
2010            specification_version: None,
2011        });
2012        test_roundtrip(LogicalType::Geometry {
2013            crs: Some("foo".to_owned()),
2014        });
2015        test_roundtrip(LogicalType::Geometry { crs: None });
2016        test_roundtrip(LogicalType::Geography {
2017            crs: Some("foo".to_owned()),
2018            algorithm: Some(EdgeInterpolationAlgorithm::ANDOYER),
2019        });
2020        test_roundtrip(LogicalType::Geography {
2021            crs: None,
2022            algorithm: Some(EdgeInterpolationAlgorithm::KARNEY),
2023        });
2024        test_roundtrip(LogicalType::Geography {
2025            crs: Some("foo".to_owned()),
2026            algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
2027        });
2028        test_roundtrip(LogicalType::Geography {
2029            crs: None,
2030            algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
2031        });
2032    }
2033
2034    #[test]
2035    fn test_display_repetition() {
2036        assert_eq!(Repetition::REQUIRED.to_string(), "REQUIRED");
2037        assert_eq!(Repetition::OPTIONAL.to_string(), "OPTIONAL");
2038        assert_eq!(Repetition::REPEATED.to_string(), "REPEATED");
2039    }
2040
2041    #[test]
2042    fn test_from_string_into_repetition() {
2043        assert_eq!(
2044            Repetition::REQUIRED
2045                .to_string()
2046                .parse::<Repetition>()
2047                .unwrap(),
2048            Repetition::REQUIRED
2049        );
2050        assert_eq!(
2051            Repetition::OPTIONAL
2052                .to_string()
2053                .parse::<Repetition>()
2054                .unwrap(),
2055            Repetition::OPTIONAL
2056        );
2057        assert_eq!(
2058            Repetition::REPEATED
2059                .to_string()
2060                .parse::<Repetition>()
2061                .unwrap(),
2062            Repetition::REPEATED
2063        );
2064    }
2065
2066    #[test]
2067    fn test_display_encoding() {
2068        assert_eq!(Encoding::PLAIN.to_string(), "PLAIN");
2069        assert_eq!(Encoding::PLAIN_DICTIONARY.to_string(), "PLAIN_DICTIONARY");
2070        assert_eq!(Encoding::RLE.to_string(), "RLE");
2071        assert_eq!(Encoding::BIT_PACKED.to_string(), "BIT_PACKED");
2072        assert_eq!(
2073            Encoding::DELTA_BINARY_PACKED.to_string(),
2074            "DELTA_BINARY_PACKED"
2075        );
2076        assert_eq!(
2077            Encoding::DELTA_LENGTH_BYTE_ARRAY.to_string(),
2078            "DELTA_LENGTH_BYTE_ARRAY"
2079        );
2080        assert_eq!(Encoding::DELTA_BYTE_ARRAY.to_string(), "DELTA_BYTE_ARRAY");
2081        assert_eq!(Encoding::RLE_DICTIONARY.to_string(), "RLE_DICTIONARY");
2082    }
2083
2084    #[test]
2085    fn test_compression_codec_to_string() {
2086        assert_eq!(Compression::UNCOMPRESSED.codec_to_string(), "UNCOMPRESSED");
2087        assert_eq!(
2088            Compression::ZSTD(ZstdLevel::default()).codec_to_string(),
2089            "ZSTD"
2090        );
2091    }
2092
2093    #[test]
2094    fn test_display_compression() {
2095        assert_eq!(Compression::UNCOMPRESSED.to_string(), "UNCOMPRESSED");
2096        assert_eq!(Compression::SNAPPY.to_string(), "SNAPPY");
2097        assert_eq!(
2098            Compression::GZIP(Default::default()).to_string(),
2099            "GZIP(GzipLevel(6))"
2100        );
2101        assert_eq!(Compression::LZO.to_string(), "LZO");
2102        assert_eq!(
2103            Compression::BROTLI(Default::default()).to_string(),
2104            "BROTLI(BrotliLevel(1))"
2105        );
2106        assert_eq!(Compression::LZ4.to_string(), "LZ4");
2107        assert_eq!(
2108            Compression::ZSTD(Default::default()).to_string(),
2109            "ZSTD(ZstdLevel(1))"
2110        );
2111    }
2112
2113    #[test]
2114    fn test_display_page_type() {
2115        assert_eq!(PageType::DATA_PAGE.to_string(), "DATA_PAGE");
2116        assert_eq!(PageType::INDEX_PAGE.to_string(), "INDEX_PAGE");
2117        assert_eq!(PageType::DICTIONARY_PAGE.to_string(), "DICTIONARY_PAGE");
2118        assert_eq!(PageType::DATA_PAGE_V2.to_string(), "DATA_PAGE_V2");
2119    }
2120
2121    #[test]
2122    fn test_display_sort_order() {
2123        assert_eq!(SortOrder::SIGNED.to_string(), "SIGNED");
2124        assert_eq!(SortOrder::UNSIGNED.to_string(), "UNSIGNED");
2125        assert_eq!(SortOrder::UNDEFINED.to_string(), "UNDEFINED");
2126    }
2127
2128    #[test]
2129    fn test_display_column_order() {
2130        assert_eq!(
2131            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED).to_string(),
2132            "TYPE_DEFINED_ORDER(SIGNED)"
2133        );
2134        assert_eq!(
2135            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED).to_string(),
2136            "TYPE_DEFINED_ORDER(UNSIGNED)"
2137        );
2138        assert_eq!(
2139            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED).to_string(),
2140            "TYPE_DEFINED_ORDER(UNDEFINED)"
2141        );
2142        assert_eq!(ColumnOrder::UNDEFINED.to_string(), "UNDEFINED");
2143    }
2144
2145    #[test]
2146    fn test_column_order_roundtrip() {
2147        // SortOrder::SIGNED is the default on read.
2148        test_roundtrip(ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED))
2149    }
2150
2151    #[test]
2152    fn test_column_order_get_logical_type_sort_order() {
2153        // Helper to check the order in a list of values.
2154        // Only logical type is checked.
2155        fn check_sort_order(types: Vec<LogicalType>, expected_order: SortOrder) {
2156            for tpe in types {
2157                assert_eq!(
2158                    ColumnOrder::get_sort_order(Some(tpe), ConvertedType::NONE, Type::BYTE_ARRAY),
2159                    expected_order
2160                );
2161            }
2162        }
2163
2164        // Unsigned comparison (physical type does not matter)
2165        let unsigned = vec![
2166            LogicalType::String,
2167            LogicalType::Json,
2168            LogicalType::Bson,
2169            LogicalType::Enum,
2170            LogicalType::Uuid,
2171            LogicalType::Integer {
2172                bit_width: 8,
2173                is_signed: false,
2174            },
2175            LogicalType::Integer {
2176                bit_width: 16,
2177                is_signed: false,
2178            },
2179            LogicalType::Integer {
2180                bit_width: 32,
2181                is_signed: false,
2182            },
2183            LogicalType::Integer {
2184                bit_width: 64,
2185                is_signed: false,
2186            },
2187        ];
2188        check_sort_order(unsigned, SortOrder::UNSIGNED);
2189
2190        // Signed comparison (physical type does not matter)
2191        let signed = vec![
2192            LogicalType::Integer {
2193                bit_width: 8,
2194                is_signed: true,
2195            },
2196            LogicalType::Integer {
2197                bit_width: 8,
2198                is_signed: true,
2199            },
2200            LogicalType::Integer {
2201                bit_width: 8,
2202                is_signed: true,
2203            },
2204            LogicalType::Integer {
2205                bit_width: 8,
2206                is_signed: true,
2207            },
2208            LogicalType::Decimal {
2209                scale: 20,
2210                precision: 4,
2211            },
2212            LogicalType::Date,
2213            LogicalType::Time {
2214                is_adjusted_to_u_t_c: false,
2215                unit: TimeUnit::MILLIS,
2216            },
2217            LogicalType::Time {
2218                is_adjusted_to_u_t_c: false,
2219                unit: TimeUnit::MICROS,
2220            },
2221            LogicalType::Time {
2222                is_adjusted_to_u_t_c: true,
2223                unit: TimeUnit::NANOS,
2224            },
2225            LogicalType::Timestamp {
2226                is_adjusted_to_u_t_c: false,
2227                unit: TimeUnit::MILLIS,
2228            },
2229            LogicalType::Timestamp {
2230                is_adjusted_to_u_t_c: false,
2231                unit: TimeUnit::MICROS,
2232            },
2233            LogicalType::Timestamp {
2234                is_adjusted_to_u_t_c: true,
2235                unit: TimeUnit::NANOS,
2236            },
2237            LogicalType::Float16,
2238        ];
2239        check_sort_order(signed, SortOrder::SIGNED);
2240
2241        // Undefined comparison
2242        let undefined = vec![
2243            LogicalType::List,
2244            LogicalType::Map,
2245            LogicalType::Geometry { crs: None },
2246            LogicalType::Geography {
2247                crs: None,
2248                algorithm: Some(EdgeInterpolationAlgorithm::default()),
2249            },
2250        ];
2251        check_sort_order(undefined, SortOrder::UNDEFINED);
2252    }
2253
2254    #[test]
2255    fn test_column_order_get_converted_type_sort_order() {
2256        // Helper to check the order in a list of values.
2257        // Only converted type is checked.
2258        fn check_sort_order(types: Vec<ConvertedType>, expected_order: SortOrder) {
2259            for tpe in types {
2260                assert_eq!(
2261                    ColumnOrder::get_sort_order(None, tpe, Type::BYTE_ARRAY),
2262                    expected_order
2263                );
2264            }
2265        }
2266
2267        // Unsigned comparison (physical type does not matter)
2268        let unsigned = vec![
2269            ConvertedType::UTF8,
2270            ConvertedType::JSON,
2271            ConvertedType::BSON,
2272            ConvertedType::ENUM,
2273            ConvertedType::UINT_8,
2274            ConvertedType::UINT_16,
2275            ConvertedType::UINT_32,
2276            ConvertedType::UINT_64,
2277        ];
2278        check_sort_order(unsigned, SortOrder::UNSIGNED);
2279
2280        // Signed comparison (physical type does not matter)
2281        let signed = vec![
2282            ConvertedType::INT_8,
2283            ConvertedType::INT_16,
2284            ConvertedType::INT_32,
2285            ConvertedType::INT_64,
2286            ConvertedType::DECIMAL,
2287            ConvertedType::DATE,
2288            ConvertedType::TIME_MILLIS,
2289            ConvertedType::TIME_MICROS,
2290            ConvertedType::TIMESTAMP_MILLIS,
2291            ConvertedType::TIMESTAMP_MICROS,
2292        ];
2293        check_sort_order(signed, SortOrder::SIGNED);
2294
2295        // Undefined comparison
2296        let undefined = vec![
2297            ConvertedType::LIST,
2298            ConvertedType::MAP,
2299            ConvertedType::MAP_KEY_VALUE,
2300            ConvertedType::INTERVAL,
2301        ];
2302        check_sort_order(undefined, SortOrder::UNDEFINED);
2303
2304        // Check None logical type
2305        // This should return a sort order for byte array type.
2306        check_sort_order(vec![ConvertedType::NONE], SortOrder::UNSIGNED);
2307    }
2308
2309    #[test]
2310    fn test_column_order_get_default_sort_order() {
2311        // Comparison based on physical type
2312        assert_eq!(
2313            ColumnOrder::get_default_sort_order(Type::BOOLEAN),
2314            SortOrder::UNSIGNED
2315        );
2316        assert_eq!(
2317            ColumnOrder::get_default_sort_order(Type::INT32),
2318            SortOrder::SIGNED
2319        );
2320        assert_eq!(
2321            ColumnOrder::get_default_sort_order(Type::INT64),
2322            SortOrder::SIGNED
2323        );
2324        assert_eq!(
2325            ColumnOrder::get_default_sort_order(Type::INT96),
2326            SortOrder::UNDEFINED
2327        );
2328        assert_eq!(
2329            ColumnOrder::get_default_sort_order(Type::FLOAT),
2330            SortOrder::SIGNED
2331        );
2332        assert_eq!(
2333            ColumnOrder::get_default_sort_order(Type::DOUBLE),
2334            SortOrder::SIGNED
2335        );
2336        assert_eq!(
2337            ColumnOrder::get_default_sort_order(Type::BYTE_ARRAY),
2338            SortOrder::UNSIGNED
2339        );
2340        assert_eq!(
2341            ColumnOrder::get_default_sort_order(Type::FIXED_LEN_BYTE_ARRAY),
2342            SortOrder::UNSIGNED
2343        );
2344    }
2345
2346    #[test]
2347    fn test_column_order_sort_order() {
2348        assert_eq!(
2349            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED).sort_order(),
2350            SortOrder::SIGNED
2351        );
2352        assert_eq!(
2353            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED).sort_order(),
2354            SortOrder::UNSIGNED
2355        );
2356        assert_eq!(
2357            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED).sort_order(),
2358            SortOrder::UNDEFINED
2359        );
2360        assert_eq!(ColumnOrder::UNDEFINED.sort_order(), SortOrder::SIGNED);
2361    }
2362
2363    #[test]
2364    fn test_parse_encoding() {
2365        let mut encoding: Encoding = "PLAIN".parse().unwrap();
2366        assert_eq!(encoding, Encoding::PLAIN);
2367        encoding = "PLAIN_DICTIONARY".parse().unwrap();
2368        assert_eq!(encoding, Encoding::PLAIN_DICTIONARY);
2369        encoding = "RLE".parse().unwrap();
2370        assert_eq!(encoding, Encoding::RLE);
2371        encoding = "BIT_PACKED".parse().unwrap();
2372        assert_eq!(encoding, Encoding::BIT_PACKED);
2373        encoding = "DELTA_BINARY_PACKED".parse().unwrap();
2374        assert_eq!(encoding, Encoding::DELTA_BINARY_PACKED);
2375        encoding = "DELTA_LENGTH_BYTE_ARRAY".parse().unwrap();
2376        assert_eq!(encoding, Encoding::DELTA_LENGTH_BYTE_ARRAY);
2377        encoding = "DELTA_BYTE_ARRAY".parse().unwrap();
2378        assert_eq!(encoding, Encoding::DELTA_BYTE_ARRAY);
2379        encoding = "RLE_DICTIONARY".parse().unwrap();
2380        assert_eq!(encoding, Encoding::RLE_DICTIONARY);
2381        encoding = "BYTE_STREAM_SPLIT".parse().unwrap();
2382        assert_eq!(encoding, Encoding::BYTE_STREAM_SPLIT);
2383
2384        // test lowercase
2385        encoding = "byte_stream_split".parse().unwrap();
2386        assert_eq!(encoding, Encoding::BYTE_STREAM_SPLIT);
2387
2388        // test unknown string
2389        match "plain_xxx".parse::<Encoding>() {
2390            Ok(e) => {
2391                panic!("Should not be able to parse {e:?}");
2392            }
2393            Err(e) => {
2394                assert_eq!(e.to_string(), "Parquet error: unknown encoding: plain_xxx");
2395            }
2396        }
2397    }
2398
2399    #[test]
2400    fn test_parse_compression() {
2401        let mut compress: Compression = "snappy".parse().unwrap();
2402        assert_eq!(compress, Compression::SNAPPY);
2403        compress = "lzo".parse().unwrap();
2404        assert_eq!(compress, Compression::LZO);
2405        compress = "zstd(3)".parse().unwrap();
2406        assert_eq!(compress, Compression::ZSTD(ZstdLevel::try_new(3).unwrap()));
2407        compress = "LZ4_RAW".parse().unwrap();
2408        assert_eq!(compress, Compression::LZ4_RAW);
2409        compress = "uncompressed".parse().unwrap();
2410        assert_eq!(compress, Compression::UNCOMPRESSED);
2411        compress = "snappy".parse().unwrap();
2412        assert_eq!(compress, Compression::SNAPPY);
2413        compress = "gzip(9)".parse().unwrap();
2414        assert_eq!(compress, Compression::GZIP(GzipLevel::try_new(9).unwrap()));
2415        compress = "lzo".parse().unwrap();
2416        assert_eq!(compress, Compression::LZO);
2417        compress = "brotli(3)".parse().unwrap();
2418        assert_eq!(
2419            compress,
2420            Compression::BROTLI(BrotliLevel::try_new(3).unwrap())
2421        );
2422        compress = "lz4".parse().unwrap();
2423        assert_eq!(compress, Compression::LZ4);
2424
2425        // test unknown compression
2426        let mut err = "plain_xxx".parse::<Encoding>().unwrap_err();
2427        assert_eq!(
2428            err.to_string(),
2429            "Parquet error: unknown encoding: plain_xxx"
2430        );
2431
2432        // test invalid compress level
2433        err = "gzip(-10)".parse::<Encoding>().unwrap_err();
2434        assert_eq!(
2435            err.to_string(),
2436            "Parquet error: unknown encoding: gzip(-10)"
2437        );
2438    }
2439
2440    #[test]
2441    fn test_display_boundary_order() {
2442        assert_eq!(BoundaryOrder::ASCENDING.to_string(), "ASCENDING");
2443        assert_eq!(BoundaryOrder::DESCENDING.to_string(), "DESCENDING");
2444        assert_eq!(BoundaryOrder::UNORDERED.to_string(), "UNORDERED");
2445    }
2446
2447    #[test]
2448    fn test_display_edge_algo() {
2449        assert_eq!(
2450            EdgeInterpolationAlgorithm::SPHERICAL.to_string(),
2451            "SPHERICAL"
2452        );
2453        assert_eq!(EdgeInterpolationAlgorithm::VINCENTY.to_string(), "VINCENTY");
2454        assert_eq!(EdgeInterpolationAlgorithm::THOMAS.to_string(), "THOMAS");
2455        assert_eq!(EdgeInterpolationAlgorithm::ANDOYER.to_string(), "ANDOYER");
2456        assert_eq!(EdgeInterpolationAlgorithm::KARNEY.to_string(), "KARNEY");
2457    }
2458
2459    fn encodings_roundtrip(mut encodings: Vec<Encoding>) {
2460        encodings.sort();
2461        let mask = EncodingMask::new_from_encodings(encodings.iter());
2462        assert!(mask.all_set(encodings.iter()));
2463        let v = mask.encodings().collect::<Vec<_>>();
2464        assert_eq!(v, encodings);
2465    }
2466
2467    #[test]
2468    fn test_encoding_roundtrip() {
2469        encodings_roundtrip(
2470            [
2471                Encoding::RLE,
2472                Encoding::PLAIN,
2473                Encoding::DELTA_BINARY_PACKED,
2474            ]
2475            .into(),
2476        );
2477        encodings_roundtrip([Encoding::RLE_DICTIONARY, Encoding::PLAIN_DICTIONARY].into());
2478        encodings_roundtrip([].into());
2479        let encodings = [
2480            Encoding::PLAIN,
2481            Encoding::BIT_PACKED,
2482            Encoding::RLE,
2483            Encoding::DELTA_BINARY_PACKED,
2484            Encoding::DELTA_BYTE_ARRAY,
2485            Encoding::DELTA_LENGTH_BYTE_ARRAY,
2486            Encoding::PLAIN_DICTIONARY,
2487            Encoding::RLE_DICTIONARY,
2488            Encoding::BYTE_STREAM_SPLIT,
2489        ];
2490        encodings_roundtrip(encodings.into());
2491    }
2492
2493    #[test]
2494    fn test_invalid_encoding_mask() {
2495        // any set bits higher than the max should trigger an error
2496        let res = EncodingMask::try_new(-1);
2497        assert!(res.is_err());
2498        let err = res.unwrap_err();
2499        assert_eq!(
2500            err.to_string(),
2501            "Parquet error: Attempt to create invalid mask: 0xffffffff"
2502        );
2503
2504        // test that GROUP_VAR_INT is disallowed
2505        let res = EncodingMask::try_new(2);
2506        assert!(res.is_err());
2507        let err = res.unwrap_err();
2508        assert_eq!(
2509            err.to_string(),
2510            "Parquet error: Attempt to create invalid mask: 0x2"
2511        );
2512    }
2513}