Skip to main content

parquet/
basic.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains Rust mappings for Thrift definition. This module contains only mappings for thrift
19//! enums and unions. Thrift structs are handled elsewhere.
20//! Refer to [`parquet.thrift`](https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift)
21//! file to see raw definitions.
22
23use std::io::Write;
24use std::str::FromStr;
25use std::{fmt, str};
26
27pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel};
28use crate::file::metadata::HeapSize;
29use crate::parquet_thrift::{
30    ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
31    WriteThrift, WriteThriftField, validate_list_type,
32};
33use crate::{
34    thrift_enum, thrift_struct, thrift_union_all_empty, thrift_union_with_unknown,
35    write_thrift_field,
36};
37
38use crate::errors::{ParquetError, Result};
39
40// ----------------------------------------------------------------------
41// Types from the Thrift definition
42
43// ----------------------------------------------------------------------
44// Mirrors thrift enum `Type`
45
thrift_enum!(
/// Types supported by Parquet.
///
/// These physical types are intended to be used in combination with the encodings to
/// control the on disk storage format.
/// For example INT16 is not included as a type since a good encoding of INT32
/// would handle this.
///
/// Narrower or unsigned integers, dates, decimals and similar values are expressed by
/// annotating one of these physical types with a [`ConvertedType`] or [`LogicalType`].
enum Type {
  // The discriminants below follow the Thrift `Type` enum that this type mirrors.
  BOOLEAN = 0;
  INT32 = 1;
  INT64 = 2;
  INT96 = 3;  // deprecated, only used by legacy implementations.
  FLOAT = 4;
  DOUBLE = 5;
  BYTE_ARRAY = 6;
  FIXED_LEN_BYTE_ARRAY = 7;
}
);
64
65// ----------------------------------------------------------------------
66// Mirrors thrift enum `ConvertedType`
67
68// TODO(ets): Adding the `NONE` variant to this enum is a bit awkward. We should
69// look into removing it and using `Option<ConvertedType>` instead.
thrift_enum!(
/// Common types (converted types) used by frameworks when using Parquet.
///
/// This helps map between types in those frameworks to the base types in Parquet.
/// This is only metadata and not needed to read or write the data.
///
/// This enum was renamed from `LogicalType` in version 4.0.0.
/// If targeting Parquet format 2.4.0 or above, please use [LogicalType] instead.
enum ConvertedType {
  /// Not defined in the spec, used internally to indicate no type conversion
  NONE = -1;

  /// A BYTE_ARRAY actually contains UTF8 encoded chars.
  UTF8 = 0;

  /// A map is converted as an optional field containing a repeated key/value pair.
  MAP = 1;

  /// A key/value pair is converted into a group of two fields.
  MAP_KEY_VALUE = 2;

  /// A list is converted into an optional field containing a repeated field for its
  /// values.
  LIST = 3;

  /// An enum is converted into a BYTE_ARRAY field
  ENUM = 4;

  /// A decimal value.
  ///
  /// This may be used to annotate BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY primitive
  /// types. The underlying byte array stores the unscaled value encoded as two's
  /// complement using big-endian byte order (the most significant byte is the
  /// zeroth element). The value of the decimal is the value * 10^{-scale}.
  ///
  /// This must be accompanied by a (maximum) precision and a scale in the
  /// SchemaElement. The precision specifies the number of digits in the decimal
  /// and the scale stores the location of the decimal point. For example 1.23
  /// would have precision 3 (3 total digits) and scale 2 (the decimal point is
  /// 2 digits over).
  DECIMAL = 5;

  /// A date stored as days since Unix epoch, encoded as the INT32 physical type.
  DATE = 6;

  /// The total number of milliseconds since midnight. The value is stored as an INT32
  /// physical type.
  TIME_MILLIS = 7;

  /// The total number of microseconds since midnight. The value is stored as an INT64
  /// physical type.
  TIME_MICROS = 8;

  /// Date and time recorded as milliseconds since the Unix epoch.
  /// Recorded as a physical type of INT64.
  TIMESTAMP_MILLIS = 9;

  /// Date and time recorded as microseconds since the Unix epoch.
  /// The value is stored as an INT64 physical type.
  TIMESTAMP_MICROS = 10;

  /// An unsigned 8 bit integer value stored as INT32 physical type.
  UINT_8 = 11;

  /// An unsigned 16 bit integer value stored as INT32 physical type.
  UINT_16 = 12;

  /// An unsigned 32 bit integer value stored as INT32 physical type.
  UINT_32 = 13;

  /// An unsigned 64 bit integer value stored as INT64 physical type.
  UINT_64 = 14;

  /// A signed 8 bit integer value stored as INT32 physical type.
  INT_8 = 15;

  /// A signed 16 bit integer value stored as INT32 physical type.
  INT_16 = 16;

  /// A signed 32 bit integer value stored as INT32 physical type.
  INT_32 = 17;

  /// A signed 64 bit integer value stored as INT64 physical type.
  INT_64 = 18;

  /// A JSON document embedded within a single UTF8 column.
  JSON = 19;

  /// A BSON document embedded within a single BINARY column.
  BSON = 20;

  /// An interval of time
  ///
  /// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12.
  /// This data is composed of three separate little endian unsigned integers.
  /// Each stores a component of a duration of time. The first integer identifies
  /// the number of months associated with the duration, the second identifies
  /// the number of days associated with the duration and the third identifies
  /// the number of milliseconds associated with the provided duration.
  /// This duration of time is independent of any particular timezone or date.
  INTERVAL = 21;
}
);
173
174// ----------------------------------------------------------------------
175// Mirrors thrift union `TimeUnit`
176
thrift_union_all_empty!(
/// Time unit for `Time` and `Timestamp` logical types.
///
/// Selects whether values are counted in milliseconds, microseconds or nanoseconds.
union TimeUnit {
  // NOTE(review): each entry reads `<field id>: <first ident> <second ident>`. Presumably the
  // first identifier is the Thrift type name and the second the generated Rust variant name;
  // confirm against the `thrift_union_all_empty!` definition.
  1: MilliSeconds MILLIS
  2: MicroSeconds MICROS
  3: NanoSeconds NANOS
}
);
185
186// ----------------------------------------------------------------------
187// Mirrors thrift union `LogicalType`
188
// Parameters carried by the `Decimal` variant of `LogicalType`.
//
// The decimal's value is `unscaled_value * 10^(-scale)`. For example 1.23 has precision 3
// (3 total digits) and scale 2 (the decimal point is 2 digits over).
thrift_struct!(
pub struct DecimalType {
  /// The location of the decimal point, i.e. how many digits lie to its right.
  1: required i32 scale
  /// The (maximum) number of digits in the decimal.
  2: required i32 precision
}
);
197
// Parameters carried by the `Timestamp` variant of `LogicalType`. The same struct is also
// used, under the `TimeType` name re-exported below, by the `Time` variant.
thrift_struct!(
pub struct TimestampType {
  /// Whether the timestamp is adjusted to UTC.
  1: required bool is_adjusted_to_u_t_c
  /// The unit of time.
  2: required TimeUnit unit
}
);

/// Identical to [`TimestampType`]
///
/// This is a re-export under a second name, so both names refer to the same struct.
pub use TimestampType as TimeType;
209
// Parameters carried by the `Integer` variant of `LogicalType`.
thrift_struct!(
pub struct IntType {
  /// The number of bits in the integer.
  1: required i8 bit_width
  /// Whether the integer is signed.
  2: required bool is_signed
}
);
218
// Parameters carried by the `Variant` variant of `LogicalType`.
thrift_struct!(
pub struct VariantType {
  /// The version of the variant specification that the variant was
  /// written with. This field is optional and may be unset.
  1: optional i8 specification_version
}
);
226
// Parameters carried by the `Geometry` variant of `LogicalType`.
thrift_struct!(
pub struct GeometryType {
  /// A custom CRS. If unset the CRS `OGC:CRS84` should be used, which means that the geometries
  /// must be stored in longitude, latitude based on the WGS84 datum.
  1: optional string crs;
}
);
234
// Parameters carried by the `Geography` variant of `LogicalType`.
thrift_struct!(
pub struct GeographyType {
  /// A custom CRS. If unset the CRS `OGC:CRS84` should be used.
  1: optional string crs;
  /// An optional algorithm can be set to correctly interpret edges interpolation
  /// of the geometries. If unset, the `SPHERICAL` algorithm should be used.
  ///
  /// The `GeographyType::algorithm()` accessor applies that default for callers.
  2: optional EdgeInterpolationAlgorithm algorithm;
}
);
244
245impl GeographyType {
246    /// Accessor for the `GeographyType::algorithm` field. If this field is not set, this
247    /// function returns the default value (currently [`EdgeInterpolationAlgorithm::SPHERICAL`]
248    /// per the Parquet [specification]).
249    ///
250    /// [specification]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#geography
251    pub fn algorithm(&self) -> Option<EdgeInterpolationAlgorithm> {
252        self.algorithm.or(Some(Default::default()))
253    }
254}
255
thrift_union_with_unknown!(
/// Logical types used by version 2.4.0+ of the Parquet format.
///
/// This is an *entirely new* type as of version
/// 4.0.0. The struct previously named `LogicalType` was renamed to
/// [`ConvertedType`]. Please see the README.md for more details.
union LogicalType {
   // Entries written as `(SomeType) Name` carry a `SomeType` payload; the remaining entries
   // carry no parameters. The leading number is the Thrift field id.
   /// A UTF8 encoded string.
   1:  String
   /// A map of key-value pairs.
   2:  Map
   /// A list of elements.
   3:  List
   /// A set of predefined values.
   4:  Enum
   /// A decimal value with a specified scale and precision.
   5:  (DecimalType) Decimal
   /// A date stored as days since Unix epoch.
   6:  Date
   /// A time stored as [`TimeUnit`] since midnight.
   7:  (TimeType) Time
   /// A timestamp stored as [`TimeUnit`] since Unix epoch.
   8:  (TimestampType) Timestamp
   // 9: reserved for INTERVAL
   /// An integer with a specified bit width and signedness.
   10: (IntType) Integer
   /// An unknown logical type.
   11: Unknown
   /// A JSON document.
   12: Json
   /// A BSON document.
   13: Bson
   /// A UUID.
   14: Uuid
   /// A 16-bit floating point number.
   15: Float16
   /// A Variant value.
   16: (VariantType) Variant
   /// A geospatial feature in the Well-Known Binary (WKB) format with linear/planar edges interpolation.
   17: (GeometryType) Geometry
   /// A geospatial feature in the WKB format with an explicit (non-linear/non-planar) edges interpolation.
   18: (GeographyType) Geography
}
);
300
301impl LogicalType {
302    /// Create a [`LogicalType::Integer`] variant with the given `bit_width` and `is_signed`
303    pub fn integer(bit_width: i8, is_signed: bool) -> Self {
304        Self::Integer(IntType {
305            bit_width,
306            is_signed,
307        })
308    }
309
310    /// Create a [`LogicalType::Decimal`] variant with the given `scale` and `precision`
311    pub fn decimal(scale: i32, precision: i32) -> Self {
312        Self::Decimal(DecimalType { scale, precision })
313    }
314
315    /// Create a [`LogicalType::Time`] variant with the given `is_adjusted_to_u_t_c` and `unit`
316    pub fn time(is_adjusted_to_u_t_c: bool, unit: TimeUnit) -> Self {
317        Self::Time(TimeType {
318            is_adjusted_to_u_t_c,
319            unit,
320        })
321    }
322
323    /// Create a [`LogicalType::Timestamp`] variant with the given `is_adjusted_to_u_t_c` and `unit`
324    pub fn timestamp(is_adjusted_to_u_t_c: bool, unit: TimeUnit) -> Self {
325        Self::Timestamp(TimestampType {
326            is_adjusted_to_u_t_c,
327            unit,
328        })
329    }
330
331    /// Create a [`LogicalType::Variant`] variant with the given `specification_version`
332    pub fn variant(specification_version: Option<i8>) -> Self {
333        Self::Variant(VariantType {
334            specification_version,
335        })
336    }
337
338    /// Create a [`LogicalType::Geometry`] variant with the given `crs`
339    pub fn geometry(crs: Option<String>) -> Self {
340        Self::Geometry(GeometryType { crs })
341    }
342
343    /// Create a [`LogicalType::Geography`] variant with the given `crs` and `algorithm`
344    pub fn geography(crs: Option<String>, algorithm: Option<EdgeInterpolationAlgorithm>) -> Self {
345        Self::Geography(GeographyType { crs, algorithm })
346    }
347}
348
349// ----------------------------------------------------------------------
350// Mirrors thrift enum `FieldRepetitionType`
351//
352
thrift_enum!(
/// Representation of field types in schema.
///
/// Describes how many values of a field each row may hold.
enum FieldRepetitionType {
  /// This field is required (can not be null) and each row has exactly 1 value.
  REQUIRED = 0;
  /// The field is optional (can be null) and each row has 0 or 1 values.
  OPTIONAL = 1;
  /// The field is repeated and can contain 0 or more values.
  REPEATED = 2;
}
);

/// Type alias for thrift `FieldRepetitionType`
///
/// `Repetition` and [`FieldRepetitionType`] name the same type and can be used interchangeably.
pub type Repetition = FieldRepetitionType;
367
368// ----------------------------------------------------------------------
369// Mirrors thrift enum `Encoding`
370
thrift_enum!(
/// Encodings supported by Parquet.
///
/// Not all encodings are valid for all types. These enums are also used to specify the
/// encoding of definition and repetition levels.
///
/// By default this crate uses [Encoding::PLAIN], [Encoding::RLE], and [Encoding::RLE_DICTIONARY].
/// These provide very good encode and decode performance, whilst yielding reasonable storage
/// efficiency and being supported by all major parquet readers.
///
/// The delta encodings are also supported and will be used if a newer [WriterVersion] is
/// configured, however, it should be noted that these sacrifice encode and decode performance for
/// improved storage efficiency. This performance regression is particularly pronounced in the case
/// of record skipping as occurs during predicate push-down. It is recommended users assess the
/// performance impact when evaluating these encodings.
///
/// [WriterVersion]: crate::file::properties::WriterVersion
enum Encoding {
  /// Default encoding.
  /// - BOOLEAN - 1 bit per value. 0 is false; 1 is true.
  /// - INT32 - 4 bytes per value.  Stored as little-endian.
  /// - INT64 - 8 bytes per value.  Stored as little-endian.
  /// - FLOAT - 4 bytes per value.  IEEE. Stored as little-endian.
  /// - DOUBLE - 8 bytes per value.  IEEE. Stored as little-endian.
  /// - BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
  /// - FIXED_LEN_BYTE_ARRAY - Just the bytes.
  PLAIN = 0;
  // Value 1 (GROUP_VAR_INT) was never used and is deliberately left unassigned, so the
  // discriminants of this enum are not contiguous.
  //  GROUP_VAR_INT = 1;
  /// **Deprecated** dictionary encoding.
  ///
  /// The values in the dictionary are encoded using PLAIN encoding.
  /// Since it is deprecated, RLE_DICTIONARY encoding is used for a data page, and
  /// PLAIN encoding is used for dictionary page.
  PLAIN_DICTIONARY = 2;
  /// Group packed run length encoding.
  ///
  /// Usable for definition/repetition levels encoding and boolean values.
  RLE = 3;
  /// **Deprecated** Bit-packed encoding.
  ///
  /// This can only be used if the data has a known max width.
  /// Usable for definition/repetition levels encoding.
  ///
  /// There are compatibility issues with files using this encoding.
  /// The parquet standard specifies the bits to be packed starting from the
  /// most-significant bit, several implementations do not follow this bit order.
  /// Several other implementations also have issues reading this encoding
  /// because of incorrect assumptions about the length of the encoded data.
  ///
  /// The RLE/bit-packing hybrid is more cpu and memory efficient and should be used instead.
  #[deprecated(
      since = "51.0.0",
      note = "Please see documentation for compatibility issues and use the RLE/bit-packing hybrid encoding instead"
  )]
  BIT_PACKED = 4;
  /// Delta encoding for integers, either INT32 or INT64.
  ///
  /// Works best on sorted data.
  DELTA_BINARY_PACKED = 5;
  /// Encoding for byte arrays to separate the length values and the data.
  ///
  /// The lengths are encoded using DELTA_BINARY_PACKED encoding.
  DELTA_LENGTH_BYTE_ARRAY = 6;
  /// Incremental encoding for byte arrays.
  ///
  /// Prefix lengths are encoded using DELTA_BINARY_PACKED encoding.
  /// Suffixes are stored using DELTA_LENGTH_BYTE_ARRAY encoding.
  DELTA_BYTE_ARRAY = 7;
  /// Dictionary encoding.
  ///
  /// The ids are encoded using the RLE encoding.
  RLE_DICTIONARY = 8;
  /// Encoding for fixed-width data.
  ///
  /// K byte-streams are created where K is the size in bytes of the data type.
  /// The individual bytes of a value are scattered to the corresponding stream and
  /// the streams are concatenated.
  /// This itself does not reduce the size of the data but can lead to better compression
  /// afterwards. Note that the use of this encoding with FIXED_LEN_BYTE_ARRAY(N) data may
  /// perform poorly for large values of N.
  BYTE_STREAM_SPLIT = 9;
}
);
454
455impl FromStr for Encoding {
456    type Err = ParquetError;
457
458    fn from_str(s: &str) -> Result<Self, Self::Err> {
459        match s {
460            "PLAIN" | "plain" => Ok(Encoding::PLAIN),
461            "PLAIN_DICTIONARY" | "plain_dictionary" => Ok(Encoding::PLAIN_DICTIONARY),
462            "RLE" | "rle" => Ok(Encoding::RLE),
463            #[allow(deprecated)]
464            "BIT_PACKED" | "bit_packed" => Ok(Encoding::BIT_PACKED),
465            "DELTA_BINARY_PACKED" | "delta_binary_packed" => Ok(Encoding::DELTA_BINARY_PACKED),
466            "DELTA_LENGTH_BYTE_ARRAY" | "delta_length_byte_array" => {
467                Ok(Encoding::DELTA_LENGTH_BYTE_ARRAY)
468            }
469            "DELTA_BYTE_ARRAY" | "delta_byte_array" => Ok(Encoding::DELTA_BYTE_ARRAY),
470            "RLE_DICTIONARY" | "rle_dictionary" => Ok(Encoding::RLE_DICTIONARY),
471            "BYTE_STREAM_SPLIT" | "byte_stream_split" => Ok(Encoding::BYTE_STREAM_SPLIT),
472            _ => Err(general_err!("unknown encoding: {}", s)),
473        }
474    }
475}
476
/// A bitmask representing the [`Encoding`]s employed while encoding a Parquet column chunk.
///
/// The Parquet [`ColumnMetaData`] struct contains an array that indicates what encodings were
/// used when writing that column chunk. For memory and performance reasons, this crate reduces
/// that array to a bitmask, where each bit position represents a different [`Encoding`]. This
/// struct contains that bitmask, and provides methods to interact with the data.
///
/// The default value is the empty mask (no encodings set).
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaDataReader;
/// # use parquet::basic::Encoding;
/// # fn open_parquet_file(path: &str) -> std::fs::File { unimplemented!(); }
/// // read parquet metadata from a file
/// let file = open_parquet_file("some_path.parquet");
/// let mut reader = ParquetMetaDataReader::new();
/// reader.try_parse(&file).unwrap();
/// let metadata = reader.finish().unwrap();
///
/// // find the encodings used by the first column chunk in the first row group
/// let col_meta = metadata.row_group(0).column(0);
/// let encodings = col_meta.encodings_mask();
///
/// // check to see if a particular encoding was used
/// let used_rle = encodings.is_set(Encoding::RLE);
///
/// // check to see if all of a set of encodings were used
/// let used_all = encodings.all_set([Encoding::RLE, Encoding::PLAIN].iter());
///
/// // convert mask to a Vec<Encoding>
/// let encodings_vec = encodings.encodings().collect::<Vec<_>>();
/// ```
///
/// [`ColumnMetaData`]: https://github.com/apache/parquet-format/blob/9fd57b59e0ce1a82a69237dcf8977d3e72a2965d/src/main/thrift/parquet.thrift#L875
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
// Bit `n` of the wrapped integer is set when the `Encoding` whose discriminant is `n` is present.
pub struct EncodingMask(i32);
512
513impl EncodingMask {
514    /// Highest valued discriminant in the [`Encoding`] enum
515    const MAX_ENCODING: i32 = Encoding::MAX_DISCRIMINANT;
516    /// A mask consisting of unused bit positions, used for validation. This includes the never
517    /// used GROUP_VAR_INT encoding value of `1`.
518    const ALLOWED_MASK: u32 =
519        !(1u32 << (EncodingMask::MAX_ENCODING as u32 + 1)).wrapping_sub(1) | 1 << 1;
520
521    /// Attempt to create a new `EncodingMask` from an integer.
522    ///
523    /// This will return an error if a bit outside the allowable range is set.
524    pub fn try_new(val: i32) -> Result<Self> {
525        if val as u32 & Self::ALLOWED_MASK != 0 {
526            return Err(general_err!("Attempt to create invalid mask: 0x{:x}", val));
527        }
528        Ok(Self(val))
529    }
530
531    /// Return an integer representation of this `EncodingMask`.
532    pub fn as_i32(&self) -> i32 {
533        self.0
534    }
535
536    /// Create a new `EncodingMask` from a collection of [`Encoding`]s.
537    pub fn new_from_encodings<'a>(encodings: impl Iterator<Item = &'a Encoding>) -> Self {
538        let mut mask = 0;
539        for &e in encodings {
540            mask |= 1 << (e as i32);
541        }
542        Self(mask)
543    }
544
545    /// Mark the given [`Encoding`] as present in this mask.
546    pub fn insert(&mut self, val: Encoding) {
547        self.0 |= 1 << (val as i32);
548    }
549
550    /// Test if a given [`Encoding`] is present in this mask.
551    pub fn is_set(&self, val: Encoding) -> bool {
552        self.0 & (1 << (val as i32)) != 0
553    }
554
555    /// Test if this mask has only the bit for the given [`Encoding`] set.
556    pub fn is_only(&self, val: Encoding) -> bool {
557        self.0 == (1 << (val as i32))
558    }
559
560    /// Test if all [`Encoding`]s in a given set are present in this mask.
561    pub fn all_set<'a>(&self, mut encodings: impl Iterator<Item = &'a Encoding>) -> bool {
562        encodings.all(|&e| self.is_set(e))
563    }
564
565    /// Return an iterator over all [`Encoding`]s present in this mask.
566    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
567        Self::mask_to_encodings_iter(self.0)
568    }
569
570    fn mask_to_encodings_iter(mask: i32) -> impl Iterator<Item = Encoding> {
571        (0..=Self::MAX_ENCODING)
572            .filter(move |i| mask & (1 << i) != 0)
573            .map(i32_to_encoding)
574    }
575}
576
// `EncodingMask` wraps a single `i32`, so it owns no heap memory.
impl HeapSize for EncodingMask {
    /// Always returns `0`.
    fn heap_size(&self) -> usize {
        0 // no heap allocations
    }
}
582
583impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for EncodingMask {
584    fn read_thrift(prot: &mut R) -> Result<Self> {
585        let mut mask = 0;
586
587        // This reads a Thrift `list<Encoding>` and turns it into a bitmask
588        let list_ident = prot.read_list_begin()?;
589        // check for enum (encoded as I32)
590        validate_list_type(ElementType::I32, &list_ident)?;
591        for _ in 0..list_ident.size {
592            let val = Encoding::read_thrift(prot)?;
593            mask |= 1 << val as i32;
594        }
595        Ok(Self(mask))
596    }
597}
598
599#[allow(deprecated)]
600fn i32_to_encoding(val: i32) -> Encoding {
601    match val {
602        0 => Encoding::PLAIN,
603        2 => Encoding::PLAIN_DICTIONARY,
604        3 => Encoding::RLE,
605        4 => Encoding::BIT_PACKED,
606        5 => Encoding::DELTA_BINARY_PACKED,
607        6 => Encoding::DELTA_LENGTH_BYTE_ARRAY,
608        7 => Encoding::DELTA_BYTE_ARRAY,
609        8 => Encoding::RLE_DICTIONARY,
610        9 => Encoding::BYTE_STREAM_SPLIT,
611        _ => panic!("Impossible encoding {val}"),
612    }
613}
614
615// ----------------------------------------------------------------------
616// Mirrors thrift enum `CompressionCodec`
617
thrift_enum!(
/// Supported compression algorithms.
///
/// Codecs added in format version X.Y can be read by readers based on X.Y and later.
/// Codec support may vary between readers based on the format version and
/// libraries available at runtime.
///
/// See [Compression.md] for a detailed specification of these algorithms.
///
/// This enum only names the codec. The [`Compression`] enum names the same codecs and
/// additionally carries a compression level for GZIP, BROTLI and ZSTD.
///
/// [Compression.md]: https://github.com/apache/parquet-format/blob/master/Compression.md
enum CompressionCodec {
  UNCOMPRESSED = 0;
  SNAPPY = 1;
  GZIP = 2;
  LZO = 3;
  BROTLI = 4;  // Added in 2.4
  LZ4 = 5;     // DEPRECATED (Added in 2.4)
  ZSTD = 6;    // Added in 2.4
  LZ4_RAW = 7; // Added in 2.9
}
);
639
640// NOTE: This enum likely belongs in file::properties now, but moving it there would be a
641// breaking API change, that's probably not worth the pain. If a new codec is added to the
642// Parquet specification, or any other breaking changes are made to this enum, this can be
643// revisited.
644
/// Supported block compression algorithms.
///
/// Block compression can yield non-trivial improvements to storage efficiency at the expense
/// of potentially significantly worse encode and decode performance. Many applications,
/// especially those making use of high-throughput and low-cost commodity object storage,
/// may find storage efficiency less important than decode throughput, and therefore may
/// wish to not make use of block compression.
///
/// The writers in this crate default to no block compression for this reason.
///
/// Applications that do still wish to use block compression, will find [`Compression::ZSTD`]
/// to provide a good balance of compression, performance, and ecosystem support. Alternatively,
/// [`Compression::LZ4_RAW`] provides much faster decompression speeds, at the cost of typically
/// worse compression ratios. However, it is not as widely supported by the ecosystem, with the
/// Hadoop ecosystem historically favoring the non-standard and now deprecated [`Compression::LZ4`].
///
/// A value can also be parsed from a string such as `snappy` or `zstd(3)` via [`FromStr`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum Compression {
    /// No compression.
    UNCOMPRESSED,
    /// [Snappy compression](https://en.wikipedia.org/wiki/Snappy_(compression))
    SNAPPY,
    /// [Gzip compression](https://www.ietf.org/rfc/rfc1952.txt) at the given level.
    GZIP(GzipLevel),
    /// [LZO compression](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer)
    LZO,
    /// [Brotli compression](https://datatracker.ietf.org/doc/html/rfc7932) at the given level.
    BROTLI(BrotliLevel),
    /// [LZ4 compression](https://lz4.org/), [(deprecated)](https://issues.apache.org/jira/browse/PARQUET-2032)
    LZ4,
    /// [ZSTD compression](https://datatracker.ietf.org/doc/html/rfc8878) at the given level.
    ZSTD(ZstdLevel),
    /// [LZ4 compression](https://lz4.org/).
    ///
    /// Unlike [`Compression::LZ4`], this variant is not deprecated.
    LZ4_RAW,
}
680
681impl From<CompressionCodec> for Compression {
682    fn from(value: CompressionCodec) -> Self {
683        match value {
684            CompressionCodec::UNCOMPRESSED => Compression::UNCOMPRESSED,
685            CompressionCodec::SNAPPY => Compression::SNAPPY,
686            CompressionCodec::GZIP => Compression::GZIP(Default::default()),
687            CompressionCodec::LZO => Compression::LZO,
688            CompressionCodec::BROTLI => Compression::BROTLI(Default::default()),
689            CompressionCodec::LZ4 => Compression::LZ4,
690            CompressionCodec::ZSTD => Compression::ZSTD(Default::default()),
691            CompressionCodec::LZ4_RAW => Compression::LZ4_RAW,
692        }
693    }
694}
695
696impl From<Compression> for CompressionCodec {
697    fn from(value: Compression) -> Self {
698        match value {
699            Compression::UNCOMPRESSED => CompressionCodec::UNCOMPRESSED,
700            Compression::SNAPPY => CompressionCodec::SNAPPY,
701            Compression::GZIP(_) => CompressionCodec::GZIP,
702            Compression::LZO => CompressionCodec::LZO,
703            Compression::BROTLI(_) => CompressionCodec::BROTLI,
704            Compression::LZ4 => CompressionCodec::LZ4,
705            Compression::ZSTD(_) => CompressionCodec::ZSTD,
706            Compression::LZ4_RAW => CompressionCodec::LZ4_RAW,
707        }
708    }
709}
710
711fn split_compression_string(str_setting: &str) -> Result<(&str, Option<u32>), ParquetError> {
712    let split_setting = str_setting.split_once('(');
713
714    match split_setting {
715        Some((codec, level_str)) => {
716            let level = &level_str[..level_str.len() - 1]
717                .parse::<u32>()
718                .map_err(|_| {
719                    ParquetError::General(format!("invalid compression level: {level_str}"))
720                })?;
721            Ok((codec, Some(*level)))
722        }
723        None => Ok((str_setting, None)),
724    }
725}
726
727fn check_level_is_none(level: &Option<u32>) -> Result<(), ParquetError> {
728    if level.is_some() {
729        return Err(ParquetError::General(
730            "compression level is not supported".to_string(),
731        ));
732    }
733
734    Ok(())
735}
736
737fn require_level(codec: &str, level: Option<u32>) -> Result<u32, ParquetError> {
738    level.ok_or(ParquetError::General(format!(
739        "{codec} requires a compression level",
740    )))
741}
742
743impl FromStr for Compression {
744    type Err = ParquetError;
745
746    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
747        let (codec, level) = split_compression_string(s)?;
748
749        let c = match codec {
750            "UNCOMPRESSED" | "uncompressed" => {
751                check_level_is_none(&level)?;
752                Compression::UNCOMPRESSED
753            }
754            "SNAPPY" | "snappy" => {
755                check_level_is_none(&level)?;
756                Compression::SNAPPY
757            }
758            "GZIP" | "gzip" => {
759                let level = require_level(codec, level)?;
760                Compression::GZIP(GzipLevel::try_new(level)?)
761            }
762            "LZO" | "lzo" => {
763                check_level_is_none(&level)?;
764                Compression::LZO
765            }
766            "BROTLI" | "brotli" => {
767                let level = require_level(codec, level)?;
768                Compression::BROTLI(BrotliLevel::try_new(level)?)
769            }
770            "LZ4" | "lz4" => {
771                check_level_is_none(&level)?;
772                Compression::LZ4
773            }
774            "ZSTD" | "zstd" => {
775                let level = require_level(codec, level)?;
776                Compression::ZSTD(ZstdLevel::try_new(level as i32)?)
777            }
778            "LZ4_RAW" | "lz4_raw" => {
779                check_level_is_none(&level)?;
780                Compression::LZ4_RAW
781            }
782            _ => {
783                return Err(ParquetError::General(format!(
784                    "unsupport compression {codec}"
785                )));
786            }
787        };
788
789        Ok(c)
790    }
791}
792
793// ----------------------------------------------------------------------
794// Mirrors thrift enum `PageType`
795
thrift_enum!(
/// Available data pages for Parquet file format.
/// Note that some of the page types may not be supported.
enum PageType {
  // The discriminants below follow the Thrift `PageType` enum that this type mirrors.
  DATA_PAGE = 0;
  INDEX_PAGE = 1;
  DICTIONARY_PAGE = 2;
  DATA_PAGE_V2 = 3;
}
);
806
807// ----------------------------------------------------------------------
808// Mirrors thrift enum `BoundaryOrder`
809
thrift_enum!(
/// Enum to annotate whether lists of min/max elements inside ColumnIndex
/// are ordered and if so, in which direction.
enum BoundaryOrder {
  // The discriminants below follow the Thrift `BoundaryOrder` enum that this type mirrors.
  UNORDERED = 0;
  ASCENDING = 1;
  DESCENDING = 2;
}
);
819
820// ----------------------------------------------------------------------
821// Mirrors thrift enum `EdgeInterpolationAlgorithm`
822
823// this is hand coded to allow for the _Unknown variant (allows this to be forward compatible)
824
/// Edge interpolation algorithm for [`LogicalType::Geography`].
///
/// The named variants carry explicit `i32` discriminants; any other value is kept in
/// [`EdgeInterpolationAlgorithm::_Unknown`]. The default is
/// [`EdgeInterpolationAlgorithm::SPHERICAL`].
#[repr(i32)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum EdgeInterpolationAlgorithm {
    /// Edges are interpolated as geodesics on a sphere.
    #[default]
    SPHERICAL = 0,
    /// <https://en.wikipedia.org/wiki/Vincenty%27s_formulae>
    VINCENTY = 1,
    /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970
    THOMAS = 2,
    /// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965.
    ANDOYER = 3,
    /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55
    KARNEY = 4,
    /// An algorithm that is not one of the named variants; the wrapped `i32` is its raw value.
    _Unknown(i32),
}
844
#[cfg(feature = "geospatial")]
impl EdgeInterpolationAlgorithm {
    /// Maps this [`EdgeInterpolationAlgorithm`] onto the matching
    /// [`parquet_geospatial::WkbEdges`] value.
    ///
    /// # Errors
    ///
    /// Fails only for the `_Unknown` variant, which has no `WkbEdges` counterpart.
    pub fn try_as_edges(&self) -> Result<parquet_geospatial::WkbEdges> {
        use parquet_geospatial::WkbEdges;

        let edges = match *self {
            Self::SPHERICAL => WkbEdges::Spherical,
            Self::VINCENTY => WkbEdges::Vincenty,
            Self::THOMAS => WkbEdges::Thomas,
            Self::ANDOYER => WkbEdges::Andoyer,
            Self::KARNEY => WkbEdges::Karney,
            Self::_Unknown(_) => {
                return Err(general_err!(
                    "Unknown edge interpolation algorithm: {}",
                    self
                ));
            }
        };
        Ok(edges)
    }
}
866
867impl fmt::Display for EdgeInterpolationAlgorithm {
868    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
869        f.write_fmt(format_args!("{0:?}", self))
870    }
871}
872
#[cfg(feature = "geospatial")]
impl From<parquet_geospatial::WkbEdges> for EdgeInterpolationAlgorithm {
    /// Converts a [`parquet_geospatial::WkbEdges`] value into the variant of the same name.
    fn from(value: parquet_geospatial::WkbEdges) -> Self {
        use parquet_geospatial::WkbEdges;

        match value {
            WkbEdges::Spherical => Self::SPHERICAL,
            WkbEdges::Vincenty => Self::VINCENTY,
            WkbEdges::Thomas => Self::THOMAS,
            WkbEdges::Andoyer => Self::ANDOYER,
            WkbEdges::Karney => Self::KARNEY,
        }
    }
}
885
886impl FromStr for EdgeInterpolationAlgorithm {
887    type Err = ParquetError;
888
889    fn from_str(s: &str) -> Result<Self> {
890        match s.to_ascii_uppercase().as_str() {
891            "SPHERICAL" => Ok(EdgeInterpolationAlgorithm::SPHERICAL),
892            "VINCENTY" => Ok(EdgeInterpolationAlgorithm::VINCENTY),
893            "THOMAS" => Ok(EdgeInterpolationAlgorithm::THOMAS),
894            "ANDOYER" => Ok(EdgeInterpolationAlgorithm::ANDOYER),
895            "KARNEY" => Ok(EdgeInterpolationAlgorithm::KARNEY),
896            unknown => Err(general_err!(
897                "Unknown edge interpolation algorithm: {}",
898                unknown
899            )),
900        }
901    }
902}
903
904impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for EdgeInterpolationAlgorithm {
905    fn read_thrift(prot: &mut R) -> Result<Self> {
906        let val = prot.read_i32()?;
907        match val {
908            0 => Ok(Self::SPHERICAL),
909            1 => Ok(Self::VINCENTY),
910            2 => Ok(Self::THOMAS),
911            3 => Ok(Self::ANDOYER),
912            4 => Ok(Self::KARNEY),
913            _ => Ok(Self::_Unknown(val)),
914        }
915    }
916}
917
918impl WriteThrift for EdgeInterpolationAlgorithm {
919    const ELEMENT_TYPE: ElementType = ElementType::I32;
920    fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
921        let val: i32 = match *self {
922            Self::SPHERICAL => 0,
923            Self::VINCENTY => 1,
924            Self::THOMAS => 2,
925            Self::ANDOYER => 3,
926            Self::KARNEY => 4,
927            Self::_Unknown(i) => i,
928        };
929        writer.write_i32(val)
930    }
931}
932
933write_thrift_field!(EdgeInterpolationAlgorithm, FieldType::I32);
934
935// ----------------------------------------------------------------------
936// Mirrors thrift union `BloomFilterAlgorithm`
937
thrift_union_all_empty!(
/// The algorithm used in Bloom filter.
///
/// Mirrors the thrift union `BloomFilterAlgorithm`, which declares a single member.
union BloomFilterAlgorithm {
  /// Block-based Bloom filter.
  1: SplitBlockAlgorithm BLOCK;
}
);
945
946// ----------------------------------------------------------------------
947// Mirrors thrift union `BloomFilterHash`
948
thrift_union_all_empty!(
/// The hash function used in Bloom filter. This function takes the hash of a column value
/// using plain encoding.
///
/// Mirrors the thrift union `BloomFilterHash`, which declares a single member.
union BloomFilterHash {
  /// xxHash Strategy.
  1: XxHash XXHASH;
}
);
957
958// ----------------------------------------------------------------------
959// Mirrors thrift union `BloomFilterCompression`
960
thrift_union_all_empty!(
/// The compression used in the Bloom filter.
///
/// Mirrors the thrift union `BloomFilterCompression`, which declares a single member.
union BloomFilterCompression {
  /// The Bloom filter is not compressed.
  1: Uncompressed UNCOMPRESSED;
}
);
967
968// ----------------------------------------------------------------------
969// Mirrors thrift union `ColumnOrder`
970
/// Sort order used for page and column statistics.
///
/// Every type has an associated sort order and column statistics are aggregated under
/// that order, so it has to be taken into account whenever a value is compared with a
/// statistics min/max.
///
/// See reference in
/// <https://github.com/apache/arrow/blob/main/cpp/src/parquet/types.h>
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SortOrder {
    /// Signed comparison (either of the value or, for legacy data, byte-wise).
    SIGNED,
    /// Unsigned comparison (of the value or byte-wise, depending on the physical type).
    UNSIGNED,
    /// No comparison is defined.
    UNDEFINED,
}
989
990impl SortOrder {
991    /// Returns true if this is [`Self::SIGNED`]
992    pub fn is_signed(&self) -> bool {
993        matches!(self, Self::SIGNED)
994    }
995}
996
997/// Column order that specifies what method was used to aggregate min/max values for
998/// statistics.
999///
1000/// If column order is undefined, then it is the legacy behaviour and all values should
1001/// be compared as signed values/bytes.
1002#[derive(Debug, Clone, Copy, PartialEq, Eq)]
1003#[allow(non_camel_case_types)]
1004pub enum ColumnOrder {
1005    /// Column uses the order defined by its logical or physical type
1006    /// (if there is no logical type), parquet-format 2.4.0+.
1007    TYPE_DEFINED_ORDER(SortOrder),
1008    // The following are not defined in the Parquet spec and should always be last.
1009    /// Undefined column order, means legacy behaviour before parquet-format 2.4.0.
1010    /// Sort order is always SIGNED.
1011    UNDEFINED,
1012    /// An unknown but present ColumnOrder. Statistics with an unknown `ColumnOrder`
1013    /// will be ignored.
1014    UNKNOWN,
1015}
1016
1017impl ColumnOrder {
1018    /// Returns sort order for a physical/logical type.
1019    #[deprecated(
1020        since = "57.1.0",
1021        note = "use `ColumnOrder::sort_order_for_type` instead"
1022    )]
1023    pub fn get_sort_order(
1024        logical_type: Option<LogicalType>,
1025        converted_type: ConvertedType,
1026        physical_type: Type,
1027    ) -> SortOrder {
1028        Self::sort_order_for_type(logical_type.as_ref(), converted_type, physical_type)
1029    }
1030
1031    /// Returns sort order for a physical/logical type.
1032    pub fn sort_order_for_type(
1033        logical_type: Option<&LogicalType>,
1034        converted_type: ConvertedType,
1035        physical_type: Type,
1036    ) -> SortOrder {
1037        match logical_type {
1038            Some(logical) => match logical {
1039                LogicalType::String | LogicalType::Enum | LogicalType::Json | LogicalType::Bson => {
1040                    SortOrder::UNSIGNED
1041                }
1042                LogicalType::Integer(int) => match int.is_signed {
1043                    true => SortOrder::SIGNED,
1044                    false => SortOrder::UNSIGNED,
1045                },
1046                LogicalType::Map | LogicalType::List => SortOrder::UNDEFINED,
1047                LogicalType::Decimal(_) => SortOrder::SIGNED,
1048                LogicalType::Date => SortOrder::SIGNED,
1049                LogicalType::Time(_) => SortOrder::SIGNED,
1050                LogicalType::Timestamp(_) => SortOrder::SIGNED,
1051                LogicalType::Unknown => SortOrder::UNDEFINED,
1052                LogicalType::Uuid => SortOrder::UNSIGNED,
1053                LogicalType::Float16 => SortOrder::SIGNED,
1054                LogicalType::Variant(_)
1055                | LogicalType::Geometry(_)
1056                | LogicalType::Geography(_)
1057                | LogicalType::_Unknown { .. } => SortOrder::UNDEFINED,
1058            },
1059            // Fall back to converted type
1060            None => Self::get_converted_sort_order(converted_type, physical_type),
1061        }
1062    }
1063
1064    fn get_converted_sort_order(converted_type: ConvertedType, physical_type: Type) -> SortOrder {
1065        match converted_type {
1066            // Unsigned byte-wise comparison.
1067            ConvertedType::UTF8
1068            | ConvertedType::JSON
1069            | ConvertedType::BSON
1070            | ConvertedType::ENUM => SortOrder::UNSIGNED,
1071
1072            ConvertedType::INT_8
1073            | ConvertedType::INT_16
1074            | ConvertedType::INT_32
1075            | ConvertedType::INT_64 => SortOrder::SIGNED,
1076
1077            ConvertedType::UINT_8
1078            | ConvertedType::UINT_16
1079            | ConvertedType::UINT_32
1080            | ConvertedType::UINT_64 => SortOrder::UNSIGNED,
1081
1082            // Signed comparison of the represented value.
1083            ConvertedType::DECIMAL => SortOrder::SIGNED,
1084
1085            ConvertedType::DATE => SortOrder::SIGNED,
1086
1087            ConvertedType::TIME_MILLIS
1088            | ConvertedType::TIME_MICROS
1089            | ConvertedType::TIMESTAMP_MILLIS
1090            | ConvertedType::TIMESTAMP_MICROS => SortOrder::SIGNED,
1091
1092            ConvertedType::INTERVAL => SortOrder::UNDEFINED,
1093
1094            ConvertedType::LIST | ConvertedType::MAP | ConvertedType::MAP_KEY_VALUE => {
1095                SortOrder::UNDEFINED
1096            }
1097
1098            // Fall back to physical type.
1099            ConvertedType::NONE => Self::get_default_sort_order(physical_type),
1100        }
1101    }
1102
1103    /// Returns default sort order based on physical type.
1104    fn get_default_sort_order(physical_type: Type) -> SortOrder {
1105        match physical_type {
1106            // Order: false, true
1107            Type::BOOLEAN => SortOrder::UNSIGNED,
1108            Type::INT32 | Type::INT64 => SortOrder::SIGNED,
1109            Type::INT96 => SortOrder::UNDEFINED,
1110            // Notes to remember when comparing float/double values:
1111            // If the min is a NaN, it should be ignored.
1112            // If the max is a NaN, it should be ignored.
1113            // If the min is +0, the row group may contain -0 values as well.
1114            // If the max is -0, the row group may contain +0 values as well.
1115            // When looking for NaN values, min and max should be ignored.
1116            Type::FLOAT | Type::DOUBLE => SortOrder::SIGNED,
1117            // Unsigned byte-wise comparison
1118            Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => SortOrder::UNSIGNED,
1119        }
1120    }
1121
1122    /// Returns sort order associated with this column order.
1123    pub fn sort_order(&self) -> SortOrder {
1124        match *self {
1125            ColumnOrder::TYPE_DEFINED_ORDER(order) => order,
1126            ColumnOrder::UNDEFINED => SortOrder::SIGNED,
1127            ColumnOrder::UNKNOWN => SortOrder::UNDEFINED,
1128        }
1129    }
1130}
1131
1132impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ColumnOrder {
1133    fn read_thrift(prot: &mut R) -> Result<Self> {
1134        let field_ident = prot.read_field_begin(0)?;
1135        if field_ident.field_type == FieldType::Stop {
1136            return Err(general_err!("Received empty union from remote ColumnOrder"));
1137        }
1138        let ret = match field_ident.id {
1139            1 => {
1140                // NOTE: the sort order needs to be set correctly after parsing.
1141                prot.skip_empty_struct()?;
1142                Self::TYPE_DEFINED_ORDER(SortOrder::SIGNED)
1143            }
1144            _ => {
1145                prot.skip(field_ident.field_type)?;
1146                Self::UNKNOWN
1147            }
1148        };
1149        let field_ident = prot.read_field_begin(field_ident.id)?;
1150        if field_ident.field_type != FieldType::Stop {
1151            return Err(general_err!(
1152                "Received multiple fields for union from remote ColumnOrder"
1153            ));
1154        }
1155        Ok(ret)
1156    }
1157}
1158
1159impl WriteThrift for ColumnOrder {
1160    const ELEMENT_TYPE: ElementType = ElementType::Struct;
1161
1162    fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
1163        match *self {
1164            Self::TYPE_DEFINED_ORDER(_) => {
1165                writer.write_field_begin(FieldType::Struct, 1, 0)?;
1166                writer.write_struct_end()?;
1167            }
1168            _ => return Err(general_err!("Attempt to write undefined ColumnOrder")),
1169        }
1170        // write end of struct for this union
1171        writer.write_struct_end()
1172    }
1173}
1174
1175// ----------------------------------------------------------------------
1176// Display handlers
1177
1178impl fmt::Display for Compression {
1179    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1180        write!(f, "{self:?}")
1181    }
1182}
1183
1184impl fmt::Display for SortOrder {
1185    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1186        write!(f, "{self:?}")
1187    }
1188}
1189
1190impl fmt::Display for ColumnOrder {
1191    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1192        write!(f, "{self:?}")
1193    }
1194}
1195
1196// ----------------------------------------------------------------------
1197// LogicalType <=> ConvertedType conversion
1198
1199// Note: To prevent type loss when converting from ConvertedType to LogicalType,
1200// the conversion from ConvertedType -> LogicalType is not implemented.
1201// Such type loss includes:
1202// - Not knowing the decimal scale and precision of ConvertedType
1203// - Time and timestamp nanosecond precision, that is not supported in ConvertedType.
1204
1205impl From<Option<LogicalType>> for ConvertedType {
1206    fn from(value: Option<LogicalType>) -> Self {
1207        match value {
1208            Some(value) => match value {
1209                LogicalType::String => ConvertedType::UTF8,
1210                LogicalType::Map => ConvertedType::MAP,
1211                LogicalType::List => ConvertedType::LIST,
1212                LogicalType::Enum => ConvertedType::ENUM,
1213                LogicalType::Decimal { .. } => ConvertedType::DECIMAL,
1214                LogicalType::Date => ConvertedType::DATE,
1215                LogicalType::Time(time) => match time.unit {
1216                    TimeUnit::MILLIS => ConvertedType::TIME_MILLIS,
1217                    TimeUnit::MICROS => ConvertedType::TIME_MICROS,
1218                    TimeUnit::NANOS => ConvertedType::NONE,
1219                },
1220                LogicalType::Timestamp(time) => match time.unit {
1221                    TimeUnit::MILLIS => ConvertedType::TIMESTAMP_MILLIS,
1222                    TimeUnit::MICROS => ConvertedType::TIMESTAMP_MICROS,
1223                    TimeUnit::NANOS => ConvertedType::NONE,
1224                },
1225                LogicalType::Integer(int_type) => match (int_type.bit_width, int_type.is_signed) {
1226                    (8, true) => ConvertedType::INT_8,
1227                    (16, true) => ConvertedType::INT_16,
1228                    (32, true) => ConvertedType::INT_32,
1229                    (64, true) => ConvertedType::INT_64,
1230                    (8, false) => ConvertedType::UINT_8,
1231                    (16, false) => ConvertedType::UINT_16,
1232                    (32, false) => ConvertedType::UINT_32,
1233                    (64, false) => ConvertedType::UINT_64,
1234                    (bit_width, is_signed) => panic!(
1235                        "Integer type bit_width={bit_width}, signed={is_signed} is not supported"
1236                    ),
1237                },
1238                LogicalType::Json => ConvertedType::JSON,
1239                LogicalType::Bson => ConvertedType::BSON,
1240                LogicalType::Uuid
1241                | LogicalType::Float16
1242                | LogicalType::Variant(_)
1243                | LogicalType::Geometry(_)
1244                | LogicalType::Geography(_)
1245                | LogicalType::_Unknown { .. }
1246                | LogicalType::Unknown => ConvertedType::NONE,
1247            },
1248            None => ConvertedType::NONE,
1249        }
1250    }
1251}
1252
1253// ----------------------------------------------------------------------
1254// String conversions for schema parsing.
1255
1256impl str::FromStr for Repetition {
1257    type Err = ParquetError;
1258
1259    fn from_str(s: &str) -> Result<Self> {
1260        match s {
1261            "REQUIRED" => Ok(Repetition::REQUIRED),
1262            "OPTIONAL" => Ok(Repetition::OPTIONAL),
1263            "REPEATED" => Ok(Repetition::REPEATED),
1264            other => Err(general_err!("Invalid parquet repetition {}", other)),
1265        }
1266    }
1267}
1268
1269impl str::FromStr for Type {
1270    type Err = ParquetError;
1271
1272    fn from_str(s: &str) -> Result<Self> {
1273        match s {
1274            "BOOLEAN" => Ok(Type::BOOLEAN),
1275            "INT32" => Ok(Type::INT32),
1276            "INT64" => Ok(Type::INT64),
1277            "INT96" => Ok(Type::INT96),
1278            "FLOAT" => Ok(Type::FLOAT),
1279            "DOUBLE" => Ok(Type::DOUBLE),
1280            "BYTE_ARRAY" | "BINARY" => Ok(Type::BYTE_ARRAY),
1281            "FIXED_LEN_BYTE_ARRAY" => Ok(Type::FIXED_LEN_BYTE_ARRAY),
1282            other => Err(general_err!("Invalid parquet type {}", other)),
1283        }
1284    }
1285}
1286
1287impl str::FromStr for ConvertedType {
1288    type Err = ParquetError;
1289
1290    fn from_str(s: &str) -> Result<Self> {
1291        match s {
1292            "NONE" => Ok(ConvertedType::NONE),
1293            "UTF8" => Ok(ConvertedType::UTF8),
1294            "MAP" => Ok(ConvertedType::MAP),
1295            "MAP_KEY_VALUE" => Ok(ConvertedType::MAP_KEY_VALUE),
1296            "LIST" => Ok(ConvertedType::LIST),
1297            "ENUM" => Ok(ConvertedType::ENUM),
1298            "DECIMAL" => Ok(ConvertedType::DECIMAL),
1299            "DATE" => Ok(ConvertedType::DATE),
1300            "TIME_MILLIS" => Ok(ConvertedType::TIME_MILLIS),
1301            "TIME_MICROS" => Ok(ConvertedType::TIME_MICROS),
1302            "TIMESTAMP_MILLIS" => Ok(ConvertedType::TIMESTAMP_MILLIS),
1303            "TIMESTAMP_MICROS" => Ok(ConvertedType::TIMESTAMP_MICROS),
1304            "UINT_8" => Ok(ConvertedType::UINT_8),
1305            "UINT_16" => Ok(ConvertedType::UINT_16),
1306            "UINT_32" => Ok(ConvertedType::UINT_32),
1307            "UINT_64" => Ok(ConvertedType::UINT_64),
1308            "INT_8" => Ok(ConvertedType::INT_8),
1309            "INT_16" => Ok(ConvertedType::INT_16),
1310            "INT_32" => Ok(ConvertedType::INT_32),
1311            "INT_64" => Ok(ConvertedType::INT_64),
1312            "JSON" => Ok(ConvertedType::JSON),
1313            "BSON" => Ok(ConvertedType::BSON),
1314            "INTERVAL" => Ok(ConvertedType::INTERVAL),
1315            other => Err(general_err!("Invalid parquet converted type {}", other)),
1316        }
1317    }
1318}
1319
1320impl str::FromStr for LogicalType {
1321    type Err = ParquetError;
1322
1323    fn from_str(s: &str) -> Result<Self> {
1324        match s {
1325            // The type is a placeholder that gets updated elsewhere
1326            "INTEGER" => Ok(LogicalType::integer(8, false)),
1327            "MAP" => Ok(LogicalType::Map),
1328            "LIST" => Ok(LogicalType::List),
1329            "ENUM" => Ok(LogicalType::Enum),
1330            "DECIMAL" => Ok(LogicalType::decimal(-1, -1)),
1331            "DATE" => Ok(LogicalType::Date),
1332            "TIME" => Ok(LogicalType::time(false, TimeUnit::MILLIS)),
1333            "TIMESTAMP" => Ok(LogicalType::timestamp(false, TimeUnit::MILLIS)),
1334            "STRING" => Ok(LogicalType::String),
1335            "JSON" => Ok(LogicalType::Json),
1336            "BSON" => Ok(LogicalType::Bson),
1337            "UUID" => Ok(LogicalType::Uuid),
1338            "UNKNOWN" => Ok(LogicalType::Unknown),
1339            "INTERVAL" => Err(general_err!(
1340                "Interval parquet logical type not yet supported"
1341            )),
1342            "FLOAT16" => Ok(LogicalType::Float16),
1343            "VARIANT" => Ok(LogicalType::variant(None)),
1344            "GEOMETRY" => Ok(LogicalType::geometry(None)),
1345            "GEOGRAPHY" => Ok(LogicalType::geography(
1346                None,
1347                Some(EdgeInterpolationAlgorithm::SPHERICAL),
1348            )),
1349            other => Err(general_err!("Invalid parquet logical type {}", other)),
1350        }
1351    }
1352}
1353
1354#[cfg(test)]
1355#[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module
1356mod tests {
1357    use super::*;
1358    use crate::parquet_thrift::{ThriftSliceInputProtocol, tests::test_roundtrip};
1359
1360    #[test]
1361    fn test_display_type() {
1362        assert_eq!(Type::BOOLEAN.to_string(), "BOOLEAN");
1363        assert_eq!(Type::INT32.to_string(), "INT32");
1364        assert_eq!(Type::INT64.to_string(), "INT64");
1365        assert_eq!(Type::INT96.to_string(), "INT96");
1366        assert_eq!(Type::FLOAT.to_string(), "FLOAT");
1367        assert_eq!(Type::DOUBLE.to_string(), "DOUBLE");
1368        assert_eq!(Type::BYTE_ARRAY.to_string(), "BYTE_ARRAY");
1369        assert_eq!(
1370            Type::FIXED_LEN_BYTE_ARRAY.to_string(),
1371            "FIXED_LEN_BYTE_ARRAY"
1372        );
1373    }
1374
1375    #[test]
1376    fn test_from_string_into_type() {
1377        assert_eq!(
1378            Type::BOOLEAN.to_string().parse::<Type>().unwrap(),
1379            Type::BOOLEAN
1380        );
1381        assert_eq!(
1382            Type::INT32.to_string().parse::<Type>().unwrap(),
1383            Type::INT32
1384        );
1385        assert_eq!(
1386            Type::INT64.to_string().parse::<Type>().unwrap(),
1387            Type::INT64
1388        );
1389        assert_eq!(
1390            Type::INT96.to_string().parse::<Type>().unwrap(),
1391            Type::INT96
1392        );
1393        assert_eq!(
1394            Type::FLOAT.to_string().parse::<Type>().unwrap(),
1395            Type::FLOAT
1396        );
1397        assert_eq!(
1398            Type::DOUBLE.to_string().parse::<Type>().unwrap(),
1399            Type::DOUBLE
1400        );
1401        assert_eq!(
1402            Type::BYTE_ARRAY.to_string().parse::<Type>().unwrap(),
1403            Type::BYTE_ARRAY
1404        );
1405        assert_eq!("BINARY".parse::<Type>().unwrap(), Type::BYTE_ARRAY);
1406        assert_eq!(
1407            Type::FIXED_LEN_BYTE_ARRAY
1408                .to_string()
1409                .parse::<Type>()
1410                .unwrap(),
1411            Type::FIXED_LEN_BYTE_ARRAY
1412        );
1413    }
1414
1415    #[test]
1416    fn test_converted_type_roundtrip() {
1417        test_roundtrip(ConvertedType::UTF8);
1418        test_roundtrip(ConvertedType::MAP);
1419        test_roundtrip(ConvertedType::MAP_KEY_VALUE);
1420        test_roundtrip(ConvertedType::LIST);
1421        test_roundtrip(ConvertedType::ENUM);
1422        test_roundtrip(ConvertedType::DECIMAL);
1423        test_roundtrip(ConvertedType::DATE);
1424        test_roundtrip(ConvertedType::TIME_MILLIS);
1425        test_roundtrip(ConvertedType::TIME_MICROS);
1426        test_roundtrip(ConvertedType::TIMESTAMP_MILLIS);
1427        test_roundtrip(ConvertedType::TIMESTAMP_MICROS);
1428        test_roundtrip(ConvertedType::UINT_8);
1429        test_roundtrip(ConvertedType::UINT_16);
1430        test_roundtrip(ConvertedType::UINT_32);
1431        test_roundtrip(ConvertedType::UINT_64);
1432        test_roundtrip(ConvertedType::INT_8);
1433        test_roundtrip(ConvertedType::INT_16);
1434        test_roundtrip(ConvertedType::INT_32);
1435        test_roundtrip(ConvertedType::INT_64);
1436        test_roundtrip(ConvertedType::JSON);
1437        test_roundtrip(ConvertedType::BSON);
1438        test_roundtrip(ConvertedType::INTERVAL);
1439    }
1440
1441    #[test]
1442    fn test_read_invalid_converted_type() {
1443        let mut prot = ThriftSliceInputProtocol::new(&[0x7eu8]);
1444        let res = ConvertedType::read_thrift(&mut prot);
1445        assert!(res.is_err());
1446        assert_eq!(
1447            res.unwrap_err().to_string(),
1448            "Parquet error: Unexpected ConvertedType 63"
1449        );
1450    }
1451
1452    #[test]
1453    fn test_display_converted_type() {
1454        assert_eq!(ConvertedType::NONE.to_string(), "NONE");
1455        assert_eq!(ConvertedType::UTF8.to_string(), "UTF8");
1456        assert_eq!(ConvertedType::MAP.to_string(), "MAP");
1457        assert_eq!(ConvertedType::MAP_KEY_VALUE.to_string(), "MAP_KEY_VALUE");
1458        assert_eq!(ConvertedType::LIST.to_string(), "LIST");
1459        assert_eq!(ConvertedType::ENUM.to_string(), "ENUM");
1460        assert_eq!(ConvertedType::DECIMAL.to_string(), "DECIMAL");
1461        assert_eq!(ConvertedType::DATE.to_string(), "DATE");
1462        assert_eq!(ConvertedType::TIME_MILLIS.to_string(), "TIME_MILLIS");
1463        assert_eq!(ConvertedType::DATE.to_string(), "DATE");
1464        assert_eq!(ConvertedType::TIME_MICROS.to_string(), "TIME_MICROS");
1465        assert_eq!(
1466            ConvertedType::TIMESTAMP_MILLIS.to_string(),
1467            "TIMESTAMP_MILLIS"
1468        );
1469        assert_eq!(
1470            ConvertedType::TIMESTAMP_MICROS.to_string(),
1471            "TIMESTAMP_MICROS"
1472        );
1473        assert_eq!(ConvertedType::UINT_8.to_string(), "UINT_8");
1474        assert_eq!(ConvertedType::UINT_16.to_string(), "UINT_16");
1475        assert_eq!(ConvertedType::UINT_32.to_string(), "UINT_32");
1476        assert_eq!(ConvertedType::UINT_64.to_string(), "UINT_64");
1477        assert_eq!(ConvertedType::INT_8.to_string(), "INT_8");
1478        assert_eq!(ConvertedType::INT_16.to_string(), "INT_16");
1479        assert_eq!(ConvertedType::INT_32.to_string(), "INT_32");
1480        assert_eq!(ConvertedType::INT_64.to_string(), "INT_64");
1481        assert_eq!(ConvertedType::JSON.to_string(), "JSON");
1482        assert_eq!(ConvertedType::BSON.to_string(), "BSON");
1483        assert_eq!(ConvertedType::INTERVAL.to_string(), "INTERVAL");
1484        assert_eq!(ConvertedType::DECIMAL.to_string(), "DECIMAL")
1485    }
1486
1487    #[test]
1488    fn test_from_string_into_converted_type() {
1489        assert_eq!(
1490            ConvertedType::NONE
1491                .to_string()
1492                .parse::<ConvertedType>()
1493                .unwrap(),
1494            ConvertedType::NONE
1495        );
1496        assert_eq!(
1497            ConvertedType::UTF8
1498                .to_string()
1499                .parse::<ConvertedType>()
1500                .unwrap(),
1501            ConvertedType::UTF8
1502        );
1503        assert_eq!(
1504            ConvertedType::MAP
1505                .to_string()
1506                .parse::<ConvertedType>()
1507                .unwrap(),
1508            ConvertedType::MAP
1509        );
1510        assert_eq!(
1511            ConvertedType::MAP_KEY_VALUE
1512                .to_string()
1513                .parse::<ConvertedType>()
1514                .unwrap(),
1515            ConvertedType::MAP_KEY_VALUE
1516        );
1517        assert_eq!(
1518            ConvertedType::LIST
1519                .to_string()
1520                .parse::<ConvertedType>()
1521                .unwrap(),
1522            ConvertedType::LIST
1523        );
1524        assert_eq!(
1525            ConvertedType::ENUM
1526                .to_string()
1527                .parse::<ConvertedType>()
1528                .unwrap(),
1529            ConvertedType::ENUM
1530        );
1531        assert_eq!(
1532            ConvertedType::DECIMAL
1533                .to_string()
1534                .parse::<ConvertedType>()
1535                .unwrap(),
1536            ConvertedType::DECIMAL
1537        );
1538        assert_eq!(
1539            ConvertedType::DATE
1540                .to_string()
1541                .parse::<ConvertedType>()
1542                .unwrap(),
1543            ConvertedType::DATE
1544        );
1545        assert_eq!(
1546            ConvertedType::TIME_MILLIS
1547                .to_string()
1548                .parse::<ConvertedType>()
1549                .unwrap(),
1550            ConvertedType::TIME_MILLIS
1551        );
1552        assert_eq!(
1553            ConvertedType::TIME_MICROS
1554                .to_string()
1555                .parse::<ConvertedType>()
1556                .unwrap(),
1557            ConvertedType::TIME_MICROS
1558        );
1559        assert_eq!(
1560            ConvertedType::TIMESTAMP_MILLIS
1561                .to_string()
1562                .parse::<ConvertedType>()
1563                .unwrap(),
1564            ConvertedType::TIMESTAMP_MILLIS
1565        );
1566        assert_eq!(
1567            ConvertedType::TIMESTAMP_MICROS
1568                .to_string()
1569                .parse::<ConvertedType>()
1570                .unwrap(),
1571            ConvertedType::TIMESTAMP_MICROS
1572        );
1573        assert_eq!(
1574            ConvertedType::UINT_8
1575                .to_string()
1576                .parse::<ConvertedType>()
1577                .unwrap(),
1578            ConvertedType::UINT_8
1579        );
1580        assert_eq!(
1581            ConvertedType::UINT_16
1582                .to_string()
1583                .parse::<ConvertedType>()
1584                .unwrap(),
1585            ConvertedType::UINT_16
1586        );
1587        assert_eq!(
1588            ConvertedType::UINT_32
1589                .to_string()
1590                .parse::<ConvertedType>()
1591                .unwrap(),
1592            ConvertedType::UINT_32
1593        );
1594        assert_eq!(
1595            ConvertedType::UINT_64
1596                .to_string()
1597                .parse::<ConvertedType>()
1598                .unwrap(),
1599            ConvertedType::UINT_64
1600        );
1601        assert_eq!(
1602            ConvertedType::INT_8
1603                .to_string()
1604                .parse::<ConvertedType>()
1605                .unwrap(),
1606            ConvertedType::INT_8
1607        );
1608        assert_eq!(
1609            ConvertedType::INT_16
1610                .to_string()
1611                .parse::<ConvertedType>()
1612                .unwrap(),
1613            ConvertedType::INT_16
1614        );
1615        assert_eq!(
1616            ConvertedType::INT_32
1617                .to_string()
1618                .parse::<ConvertedType>()
1619                .unwrap(),
1620            ConvertedType::INT_32
1621        );
1622        assert_eq!(
1623            ConvertedType::INT_64
1624                .to_string()
1625                .parse::<ConvertedType>()
1626                .unwrap(),
1627            ConvertedType::INT_64
1628        );
1629        assert_eq!(
1630            ConvertedType::JSON
1631                .to_string()
1632                .parse::<ConvertedType>()
1633                .unwrap(),
1634            ConvertedType::JSON
1635        );
1636        assert_eq!(
1637            ConvertedType::BSON
1638                .to_string()
1639                .parse::<ConvertedType>()
1640                .unwrap(),
1641            ConvertedType::BSON
1642        );
1643        assert_eq!(
1644            ConvertedType::INTERVAL
1645                .to_string()
1646                .parse::<ConvertedType>()
1647                .unwrap(),
1648            ConvertedType::INTERVAL
1649        );
1650        assert_eq!(
1651            ConvertedType::DECIMAL
1652                .to_string()
1653                .parse::<ConvertedType>()
1654                .unwrap(),
1655            ConvertedType::DECIMAL
1656        )
1657    }
1658
1659    #[test]
1660    fn test_logical_to_converted_type() {
1661        let logical_none: Option<LogicalType> = None;
1662        assert_eq!(ConvertedType::from(logical_none), ConvertedType::NONE);
1663        assert_eq!(
1664            ConvertedType::from(Some(LogicalType::decimal(5, 20))),
1665            ConvertedType::DECIMAL
1666        );
1667        assert_eq!(
1668            ConvertedType::from(Some(LogicalType::Bson)),
1669            ConvertedType::BSON
1670        );
1671        assert_eq!(
1672            ConvertedType::from(Some(LogicalType::Json)),
1673            ConvertedType::JSON
1674        );
1675        assert_eq!(
1676            ConvertedType::from(Some(LogicalType::String)),
1677            ConvertedType::UTF8
1678        );
1679        assert_eq!(
1680            ConvertedType::from(Some(LogicalType::Date)),
1681            ConvertedType::DATE
1682        );
1683        assert_eq!(
1684            ConvertedType::from(Some(LogicalType::time(true, TimeUnit::MILLIS))),
1685            ConvertedType::TIME_MILLIS
1686        );
1687        assert_eq!(
1688            ConvertedType::from(Some(LogicalType::time(true, TimeUnit::MICROS))),
1689            ConvertedType::TIME_MICROS
1690        );
1691        assert_eq!(
1692            ConvertedType::from(Some(LogicalType::time(false, TimeUnit::NANOS))),
1693            ConvertedType::NONE
1694        );
1695        assert_eq!(
1696            ConvertedType::from(Some(LogicalType::timestamp(true, TimeUnit::MILLIS))),
1697            ConvertedType::TIMESTAMP_MILLIS
1698        );
1699        assert_eq!(
1700            ConvertedType::from(Some(LogicalType::timestamp(false, TimeUnit::MICROS))),
1701            ConvertedType::TIMESTAMP_MICROS
1702        );
1703        assert_eq!(
1704            ConvertedType::from(Some(LogicalType::timestamp(false, TimeUnit::NANOS))),
1705            ConvertedType::NONE
1706        );
1707        assert_eq!(
1708            ConvertedType::from(Some(LogicalType::integer(8, false))),
1709            ConvertedType::UINT_8
1710        );
1711        assert_eq!(
1712            ConvertedType::from(Some(LogicalType::integer(8, true))),
1713            ConvertedType::INT_8
1714        );
1715        assert_eq!(
1716            ConvertedType::from(Some(LogicalType::integer(16, false))),
1717            ConvertedType::UINT_16
1718        );
1719        assert_eq!(
1720            ConvertedType::from(Some(LogicalType::integer(16, true))),
1721            ConvertedType::INT_16
1722        );
1723        assert_eq!(
1724            ConvertedType::from(Some(LogicalType::integer(32, false))),
1725            ConvertedType::UINT_32
1726        );
1727        assert_eq!(
1728            ConvertedType::from(Some(LogicalType::integer(32, true))),
1729            ConvertedType::INT_32
1730        );
1731        assert_eq!(
1732            ConvertedType::from(Some(LogicalType::integer(64, false))),
1733            ConvertedType::UINT_64
1734        );
1735        assert_eq!(
1736            ConvertedType::from(Some(LogicalType::integer(64, true))),
1737            ConvertedType::INT_64
1738        );
1739        assert_eq!(
1740            ConvertedType::from(Some(LogicalType::List)),
1741            ConvertedType::LIST
1742        );
1743        assert_eq!(
1744            ConvertedType::from(Some(LogicalType::Map)),
1745            ConvertedType::MAP
1746        );
1747        assert_eq!(
1748            ConvertedType::from(Some(LogicalType::Uuid)),
1749            ConvertedType::NONE
1750        );
1751        assert_eq!(
1752            ConvertedType::from(Some(LogicalType::Enum)),
1753            ConvertedType::ENUM
1754        );
1755        assert_eq!(
1756            ConvertedType::from(Some(LogicalType::Float16)),
1757            ConvertedType::NONE
1758        );
1759        assert_eq!(
1760            ConvertedType::from(Some(LogicalType::variant(None))),
1761            ConvertedType::NONE
1762        );
1763        assert_eq!(
1764            ConvertedType::from(Some(LogicalType::geometry(None))),
1765            ConvertedType::NONE
1766        );
1767        assert_eq!(
1768            ConvertedType::from(Some(LogicalType::geography(None, Some(Default::default())))),
1769            ConvertedType::NONE
1770        );
1771        assert_eq!(
1772            ConvertedType::from(Some(LogicalType::Unknown)),
1773            ConvertedType::NONE
1774        );
1775    }
1776
1777    #[test]
1778    fn test_logical_type_roundtrip() {
1779        test_roundtrip(LogicalType::String);
1780        test_roundtrip(LogicalType::Map);
1781        test_roundtrip(LogicalType::List);
1782        test_roundtrip(LogicalType::Enum);
1783        test_roundtrip(LogicalType::decimal(0, 20));
1784        test_roundtrip(LogicalType::Date);
1785        test_roundtrip(LogicalType::time(true, TimeUnit::MICROS));
1786        test_roundtrip(LogicalType::time(false, TimeUnit::MILLIS));
1787        test_roundtrip(LogicalType::time(false, TimeUnit::NANOS));
1788        test_roundtrip(LogicalType::timestamp(false, TimeUnit::MICROS));
1789        test_roundtrip(LogicalType::timestamp(true, TimeUnit::MILLIS));
1790        test_roundtrip(LogicalType::timestamp(true, TimeUnit::NANOS));
1791        test_roundtrip(LogicalType::integer(8, true));
1792        test_roundtrip(LogicalType::integer(16, false));
1793        test_roundtrip(LogicalType::integer(32, true));
1794        test_roundtrip(LogicalType::integer(64, false));
1795        test_roundtrip(LogicalType::Json);
1796        test_roundtrip(LogicalType::Bson);
1797        test_roundtrip(LogicalType::Uuid);
1798        test_roundtrip(LogicalType::Float16);
1799        test_roundtrip(LogicalType::variant(Some(1)));
1800        test_roundtrip(LogicalType::variant(None));
1801        test_roundtrip(LogicalType::geometry(Some("foo".to_owned())));
1802        test_roundtrip(LogicalType::geometry(None));
1803        test_roundtrip(LogicalType::geography(
1804            Some("foo".to_owned()),
1805            Some(EdgeInterpolationAlgorithm::ANDOYER),
1806        ));
1807        test_roundtrip(LogicalType::geography(
1808            None,
1809            Some(EdgeInterpolationAlgorithm::KARNEY),
1810        ));
1811        test_roundtrip(LogicalType::geography(
1812            Some("foo".to_owned()),
1813            Some(EdgeInterpolationAlgorithm::SPHERICAL),
1814        ));
1815        test_roundtrip(LogicalType::geography(
1816            None,
1817            Some(EdgeInterpolationAlgorithm::SPHERICAL),
1818        ));
1819    }
1820
1821    #[test]
1822    fn test_display_repetition() {
1823        assert_eq!(Repetition::REQUIRED.to_string(), "REQUIRED");
1824        assert_eq!(Repetition::OPTIONAL.to_string(), "OPTIONAL");
1825        assert_eq!(Repetition::REPEATED.to_string(), "REPEATED");
1826    }
1827
1828    #[test]
1829    fn test_from_string_into_repetition() {
1830        assert_eq!(
1831            Repetition::REQUIRED
1832                .to_string()
1833                .parse::<Repetition>()
1834                .unwrap(),
1835            Repetition::REQUIRED
1836        );
1837        assert_eq!(
1838            Repetition::OPTIONAL
1839                .to_string()
1840                .parse::<Repetition>()
1841                .unwrap(),
1842            Repetition::OPTIONAL
1843        );
1844        assert_eq!(
1845            Repetition::REPEATED
1846                .to_string()
1847                .parse::<Repetition>()
1848                .unwrap(),
1849            Repetition::REPEATED
1850        );
1851    }
1852
1853    #[test]
1854    fn test_display_encoding() {
1855        assert_eq!(Encoding::PLAIN.to_string(), "PLAIN");
1856        assert_eq!(Encoding::PLAIN_DICTIONARY.to_string(), "PLAIN_DICTIONARY");
1857        assert_eq!(Encoding::RLE.to_string(), "RLE");
1858        assert_eq!(Encoding::BIT_PACKED.to_string(), "BIT_PACKED");
1859        assert_eq!(
1860            Encoding::DELTA_BINARY_PACKED.to_string(),
1861            "DELTA_BINARY_PACKED"
1862        );
1863        assert_eq!(
1864            Encoding::DELTA_LENGTH_BYTE_ARRAY.to_string(),
1865            "DELTA_LENGTH_BYTE_ARRAY"
1866        );
1867        assert_eq!(Encoding::DELTA_BYTE_ARRAY.to_string(), "DELTA_BYTE_ARRAY");
1868        assert_eq!(Encoding::RLE_DICTIONARY.to_string(), "RLE_DICTIONARY");
1869    }
1870
1871    #[test]
1872    fn test_compression_conversion() {
1873        assert_eq!(
1874            CompressionCodec::from(Compression::UNCOMPRESSED),
1875            CompressionCodec::UNCOMPRESSED
1876        );
1877        assert_eq!(
1878            CompressionCodec::from(Compression::SNAPPY),
1879            CompressionCodec::SNAPPY
1880        );
1881        assert_eq!(
1882            CompressionCodec::from(Compression::GZIP(Default::default())),
1883            CompressionCodec::GZIP
1884        );
1885        assert_eq!(
1886            CompressionCodec::from(Compression::LZO),
1887            CompressionCodec::LZO
1888        );
1889        assert_eq!(
1890            CompressionCodec::from(Compression::BROTLI(Default::default())),
1891            CompressionCodec::BROTLI
1892        );
1893        assert_eq!(
1894            CompressionCodec::from(Compression::LZ4),
1895            CompressionCodec::LZ4
1896        );
1897        assert_eq!(
1898            CompressionCodec::from(Compression::ZSTD(Default::default())),
1899            CompressionCodec::ZSTD
1900        );
1901        assert_eq!(
1902            CompressionCodec::from(Compression::LZ4_RAW),
1903            CompressionCodec::LZ4_RAW
1904        );
1905
1906        assert_eq!(
1907            Compression::from(CompressionCodec::UNCOMPRESSED),
1908            Compression::UNCOMPRESSED
1909        );
1910        assert_eq!(
1911            Compression::from(CompressionCodec::SNAPPY),
1912            Compression::SNAPPY
1913        );
1914        assert_eq!(
1915            Compression::from(CompressionCodec::GZIP),
1916            Compression::GZIP(Default::default())
1917        );
1918        assert_eq!(Compression::from(CompressionCodec::LZO), Compression::LZO);
1919        assert_eq!(
1920            Compression::from(CompressionCodec::BROTLI),
1921            Compression::BROTLI(Default::default())
1922        );
1923        assert_eq!(Compression::from(CompressionCodec::LZ4), Compression::LZ4);
1924        assert_eq!(
1925            Compression::from(CompressionCodec::ZSTD),
1926            Compression::ZSTD(Default::default())
1927        );
1928        assert_eq!(
1929            Compression::from(CompressionCodec::LZ4_RAW),
1930            Compression::LZ4_RAW
1931        );
1932    }
1933
1934    #[test]
1935    fn test_display_compression() {
1936        assert_eq!(Compression::UNCOMPRESSED.to_string(), "UNCOMPRESSED");
1937        assert_eq!(Compression::SNAPPY.to_string(), "SNAPPY");
1938        assert_eq!(
1939            Compression::GZIP(Default::default()).to_string(),
1940            "GZIP(GzipLevel(6))"
1941        );
1942        assert_eq!(Compression::LZO.to_string(), "LZO");
1943        assert_eq!(
1944            Compression::BROTLI(Default::default()).to_string(),
1945            "BROTLI(BrotliLevel(1))"
1946        );
1947        assert_eq!(Compression::LZ4.to_string(), "LZ4");
1948        assert_eq!(
1949            Compression::ZSTD(Default::default()).to_string(),
1950            "ZSTD(ZstdLevel(1))"
1951        );
1952    }
1953
1954    #[test]
1955    fn test_display_page_type() {
1956        assert_eq!(PageType::DATA_PAGE.to_string(), "DATA_PAGE");
1957        assert_eq!(PageType::INDEX_PAGE.to_string(), "INDEX_PAGE");
1958        assert_eq!(PageType::DICTIONARY_PAGE.to_string(), "DICTIONARY_PAGE");
1959        assert_eq!(PageType::DATA_PAGE_V2.to_string(), "DATA_PAGE_V2");
1960    }
1961
1962    #[test]
1963    fn test_display_sort_order() {
1964        assert_eq!(SortOrder::SIGNED.to_string(), "SIGNED");
1965        assert_eq!(SortOrder::UNSIGNED.to_string(), "UNSIGNED");
1966        assert_eq!(SortOrder::UNDEFINED.to_string(), "UNDEFINED");
1967    }
1968
1969    #[test]
1970    fn test_display_column_order() {
1971        assert_eq!(
1972            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED).to_string(),
1973            "TYPE_DEFINED_ORDER(SIGNED)"
1974        );
1975        assert_eq!(
1976            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED).to_string(),
1977            "TYPE_DEFINED_ORDER(UNSIGNED)"
1978        );
1979        assert_eq!(
1980            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED).to_string(),
1981            "TYPE_DEFINED_ORDER(UNDEFINED)"
1982        );
1983        assert_eq!(ColumnOrder::UNDEFINED.to_string(), "UNDEFINED");
1984    }
1985
1986    #[test]
1987    fn test_column_order_roundtrip() {
1988        // SortOrder::SIGNED is the default on read.
1989        test_roundtrip(ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED))
1990    }
1991
1992    #[test]
1993    fn test_column_order_get_logical_type_sort_order() {
1994        // Helper to check the order in a list of values.
1995        // Only logical type is checked.
1996        fn check_sort_order(types: Vec<LogicalType>, expected_order: SortOrder) {
1997            for tpe in types {
1998                assert_eq!(
1999                    ColumnOrder::get_sort_order(Some(tpe), ConvertedType::NONE, Type::BYTE_ARRAY),
2000                    expected_order
2001                );
2002            }
2003        }
2004
2005        // Unsigned comparison (physical type does not matter)
2006        let unsigned = vec![
2007            LogicalType::String,
2008            LogicalType::Json,
2009            LogicalType::Bson,
2010            LogicalType::Enum,
2011            LogicalType::Uuid,
2012            LogicalType::integer(8, false),
2013            LogicalType::integer(16, false),
2014            LogicalType::integer(32, false),
2015            LogicalType::integer(64, false),
2016        ];
2017        check_sort_order(unsigned, SortOrder::UNSIGNED);
2018
2019        // Signed comparison (physical type does not matter)
2020        let signed = vec![
2021            LogicalType::integer(8, true),
2022            LogicalType::integer(16, true),
2023            LogicalType::integer(32, true),
2024            LogicalType::integer(64, true),
2025            LogicalType::decimal(20, 4),
2026            LogicalType::Date,
2027            LogicalType::time(false, TimeUnit::MILLIS),
2028            LogicalType::time(false, TimeUnit::MICROS),
2029            LogicalType::time(true, TimeUnit::NANOS),
2030            LogicalType::timestamp(false, TimeUnit::MILLIS),
2031            LogicalType::timestamp(false, TimeUnit::MICROS),
2032            LogicalType::timestamp(true, TimeUnit::NANOS),
2033            LogicalType::Float16,
2034        ];
2035        check_sort_order(signed, SortOrder::SIGNED);
2036
2037        // Undefined comparison
2038        let undefined = vec![
2039            LogicalType::List,
2040            LogicalType::Map,
2041            LogicalType::variant(None),
2042            LogicalType::geometry(None),
2043            LogicalType::geography(None, Some(Default::default())),
2044        ];
2045        check_sort_order(undefined, SortOrder::UNDEFINED);
2046    }
2047
2048    #[test]
2049    fn test_column_order_get_converted_type_sort_order() {
2050        // Helper to check the order in a list of values.
2051        // Only converted type is checked.
2052        fn check_sort_order(types: Vec<ConvertedType>, expected_order: SortOrder) {
2053            for tpe in types {
2054                assert_eq!(
2055                    ColumnOrder::get_sort_order(None, tpe, Type::BYTE_ARRAY),
2056                    expected_order
2057                );
2058            }
2059        }
2060
2061        // Unsigned comparison (physical type does not matter)
2062        let unsigned = vec![
2063            ConvertedType::UTF8,
2064            ConvertedType::JSON,
2065            ConvertedType::BSON,
2066            ConvertedType::ENUM,
2067            ConvertedType::UINT_8,
2068            ConvertedType::UINT_16,
2069            ConvertedType::UINT_32,
2070            ConvertedType::UINT_64,
2071        ];
2072        check_sort_order(unsigned, SortOrder::UNSIGNED);
2073
2074        // Signed comparison (physical type does not matter)
2075        let signed = vec![
2076            ConvertedType::INT_8,
2077            ConvertedType::INT_16,
2078            ConvertedType::INT_32,
2079            ConvertedType::INT_64,
2080            ConvertedType::DECIMAL,
2081            ConvertedType::DATE,
2082            ConvertedType::TIME_MILLIS,
2083            ConvertedType::TIME_MICROS,
2084            ConvertedType::TIMESTAMP_MILLIS,
2085            ConvertedType::TIMESTAMP_MICROS,
2086        ];
2087        check_sort_order(signed, SortOrder::SIGNED);
2088
2089        // Undefined comparison
2090        let undefined = vec![
2091            ConvertedType::LIST,
2092            ConvertedType::MAP,
2093            ConvertedType::MAP_KEY_VALUE,
2094            ConvertedType::INTERVAL,
2095        ];
2096        check_sort_order(undefined, SortOrder::UNDEFINED);
2097
2098        // Check None logical type
2099        // This should return a sort order for byte array type.
2100        check_sort_order(vec![ConvertedType::NONE], SortOrder::UNSIGNED);
2101    }
2102
2103    #[test]
2104    fn test_column_order_get_default_sort_order() {
2105        // Comparison based on physical type
2106        assert_eq!(
2107            ColumnOrder::get_default_sort_order(Type::BOOLEAN),
2108            SortOrder::UNSIGNED
2109        );
2110        assert_eq!(
2111            ColumnOrder::get_default_sort_order(Type::INT32),
2112            SortOrder::SIGNED
2113        );
2114        assert_eq!(
2115            ColumnOrder::get_default_sort_order(Type::INT64),
2116            SortOrder::SIGNED
2117        );
2118        assert_eq!(
2119            ColumnOrder::get_default_sort_order(Type::INT96),
2120            SortOrder::UNDEFINED
2121        );
2122        assert_eq!(
2123            ColumnOrder::get_default_sort_order(Type::FLOAT),
2124            SortOrder::SIGNED
2125        );
2126        assert_eq!(
2127            ColumnOrder::get_default_sort_order(Type::DOUBLE),
2128            SortOrder::SIGNED
2129        );
2130        assert_eq!(
2131            ColumnOrder::get_default_sort_order(Type::BYTE_ARRAY),
2132            SortOrder::UNSIGNED
2133        );
2134        assert_eq!(
2135            ColumnOrder::get_default_sort_order(Type::FIXED_LEN_BYTE_ARRAY),
2136            SortOrder::UNSIGNED
2137        );
2138    }
2139
2140    #[test]
2141    fn test_column_order_sort_order() {
2142        assert_eq!(
2143            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED).sort_order(),
2144            SortOrder::SIGNED
2145        );
2146        assert_eq!(
2147            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED).sort_order(),
2148            SortOrder::UNSIGNED
2149        );
2150        assert_eq!(
2151            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED).sort_order(),
2152            SortOrder::UNDEFINED
2153        );
2154        assert_eq!(ColumnOrder::UNDEFINED.sort_order(), SortOrder::SIGNED);
2155    }
2156
2157    #[test]
2158    fn test_parse_encoding() {
2159        let mut encoding: Encoding = "PLAIN".parse().unwrap();
2160        assert_eq!(encoding, Encoding::PLAIN);
2161        encoding = "PLAIN_DICTIONARY".parse().unwrap();
2162        assert_eq!(encoding, Encoding::PLAIN_DICTIONARY);
2163        encoding = "RLE".parse().unwrap();
2164        assert_eq!(encoding, Encoding::RLE);
2165        encoding = "BIT_PACKED".parse().unwrap();
2166        assert_eq!(encoding, Encoding::BIT_PACKED);
2167        encoding = "DELTA_BINARY_PACKED".parse().unwrap();
2168        assert_eq!(encoding, Encoding::DELTA_BINARY_PACKED);
2169        encoding = "DELTA_LENGTH_BYTE_ARRAY".parse().unwrap();
2170        assert_eq!(encoding, Encoding::DELTA_LENGTH_BYTE_ARRAY);
2171        encoding = "DELTA_BYTE_ARRAY".parse().unwrap();
2172        assert_eq!(encoding, Encoding::DELTA_BYTE_ARRAY);
2173        encoding = "RLE_DICTIONARY".parse().unwrap();
2174        assert_eq!(encoding, Encoding::RLE_DICTIONARY);
2175        encoding = "BYTE_STREAM_SPLIT".parse().unwrap();
2176        assert_eq!(encoding, Encoding::BYTE_STREAM_SPLIT);
2177
2178        // test lowercase
2179        encoding = "byte_stream_split".parse().unwrap();
2180        assert_eq!(encoding, Encoding::BYTE_STREAM_SPLIT);
2181
2182        // test unknown string
2183        match "plain_xxx".parse::<Encoding>() {
2184            Ok(e) => {
2185                panic!("Should not be able to parse {e:?}");
2186            }
2187            Err(e) => {
2188                assert_eq!(e.to_string(), "Parquet error: unknown encoding: plain_xxx");
2189            }
2190        }
2191    }
2192
2193    #[test]
2194    fn test_parse_compression() {
2195        let mut compress: Compression = "snappy".parse().unwrap();
2196        assert_eq!(compress, Compression::SNAPPY);
2197        compress = "lzo".parse().unwrap();
2198        assert_eq!(compress, Compression::LZO);
2199        compress = "zstd(3)".parse().unwrap();
2200        assert_eq!(compress, Compression::ZSTD(ZstdLevel::try_new(3).unwrap()));
2201        compress = "LZ4_RAW".parse().unwrap();
2202        assert_eq!(compress, Compression::LZ4_RAW);
2203        compress = "uncompressed".parse().unwrap();
2204        assert_eq!(compress, Compression::UNCOMPRESSED);
2205        compress = "snappy".parse().unwrap();
2206        assert_eq!(compress, Compression::SNAPPY);
2207        compress = "gzip(9)".parse().unwrap();
2208        assert_eq!(compress, Compression::GZIP(GzipLevel::try_new(9).unwrap()));
2209        compress = "lzo".parse().unwrap();
2210        assert_eq!(compress, Compression::LZO);
2211        compress = "brotli(3)".parse().unwrap();
2212        assert_eq!(
2213            compress,
2214            Compression::BROTLI(BrotliLevel::try_new(3).unwrap())
2215        );
2216        compress = "lz4".parse().unwrap();
2217        assert_eq!(compress, Compression::LZ4);
2218
2219        // test unknown compression
2220        let mut err = "plain_xxx".parse::<Encoding>().unwrap_err();
2221        assert_eq!(
2222            err.to_string(),
2223            "Parquet error: unknown encoding: plain_xxx"
2224        );
2225
2226        // test invalid compress level
2227        err = "gzip(-10)".parse::<Encoding>().unwrap_err();
2228        assert_eq!(
2229            err.to_string(),
2230            "Parquet error: unknown encoding: gzip(-10)"
2231        );
2232    }
2233
2234    #[test]
2235    fn test_display_boundary_order() {
2236        assert_eq!(BoundaryOrder::ASCENDING.to_string(), "ASCENDING");
2237        assert_eq!(BoundaryOrder::DESCENDING.to_string(), "DESCENDING");
2238        assert_eq!(BoundaryOrder::UNORDERED.to_string(), "UNORDERED");
2239    }
2240
2241    #[test]
2242    fn test_display_edge_algo() {
2243        assert_eq!(
2244            EdgeInterpolationAlgorithm::SPHERICAL.to_string(),
2245            "SPHERICAL"
2246        );
2247        assert_eq!(EdgeInterpolationAlgorithm::VINCENTY.to_string(), "VINCENTY");
2248        assert_eq!(EdgeInterpolationAlgorithm::THOMAS.to_string(), "THOMAS");
2249        assert_eq!(EdgeInterpolationAlgorithm::ANDOYER.to_string(), "ANDOYER");
2250        assert_eq!(EdgeInterpolationAlgorithm::KARNEY.to_string(), "KARNEY");
2251    }
2252
2253    #[test]
2254    fn test_from_str_edge_algo() {
2255        assert_eq!(
2256            "spHErical".parse::<EdgeInterpolationAlgorithm>().unwrap(),
2257            EdgeInterpolationAlgorithm::SPHERICAL
2258        );
2259        assert_eq!(
2260            "vinceNTY".parse::<EdgeInterpolationAlgorithm>().unwrap(),
2261            EdgeInterpolationAlgorithm::VINCENTY
2262        );
2263        assert_eq!(
2264            "tHOmas".parse::<EdgeInterpolationAlgorithm>().unwrap(),
2265            EdgeInterpolationAlgorithm::THOMAS
2266        );
2267        assert_eq!(
2268            "anDOYEr".parse::<EdgeInterpolationAlgorithm>().unwrap(),
2269            EdgeInterpolationAlgorithm::ANDOYER
2270        );
2271        assert_eq!(
2272            "kaRNey".parse::<EdgeInterpolationAlgorithm>().unwrap(),
2273            EdgeInterpolationAlgorithm::KARNEY
2274        );
2275        assert!(
2276            "does not exist"
2277                .parse::<EdgeInterpolationAlgorithm>()
2278                .is_err()
2279        );
2280    }
2281
2282    fn encodings_roundtrip(mut encodings: Vec<Encoding>) {
2283        encodings.sort();
2284        let mask = EncodingMask::new_from_encodings(encodings.iter());
2285        assert!(mask.all_set(encodings.iter()));
2286        let v = mask.encodings().collect::<Vec<_>>();
2287        assert_eq!(v, encodings);
2288    }
2289
2290    #[test]
2291    fn test_encoding_roundtrip() {
2292        encodings_roundtrip(
2293            [
2294                Encoding::RLE,
2295                Encoding::PLAIN,
2296                Encoding::DELTA_BINARY_PACKED,
2297            ]
2298            .into(),
2299        );
2300        encodings_roundtrip([Encoding::RLE_DICTIONARY, Encoding::PLAIN_DICTIONARY].into());
2301        encodings_roundtrip([].into());
2302        let encodings = [
2303            Encoding::PLAIN,
2304            Encoding::BIT_PACKED,
2305            Encoding::RLE,
2306            Encoding::DELTA_BINARY_PACKED,
2307            Encoding::DELTA_BYTE_ARRAY,
2308            Encoding::DELTA_LENGTH_BYTE_ARRAY,
2309            Encoding::PLAIN_DICTIONARY,
2310            Encoding::RLE_DICTIONARY,
2311            Encoding::BYTE_STREAM_SPLIT,
2312        ];
2313        encodings_roundtrip(encodings.into());
2314    }
2315
2316    #[test]
2317    fn test_invalid_encoding_mask() {
2318        // any set bits higher than the max should trigger an error
2319        let res = EncodingMask::try_new(-1);
2320        assert!(res.is_err());
2321        let err = res.unwrap_err();
2322        assert_eq!(
2323            err.to_string(),
2324            "Parquet error: Attempt to create invalid mask: 0xffffffff"
2325        );
2326
2327        // test that GROUP_VAR_INT is disallowed
2328        let res = EncodingMask::try_new(2);
2329        assert!(res.is_err());
2330        let err = res.unwrap_err();
2331        assert_eq!(
2332            err.to_string(),
2333            "Parquet error: Attempt to create invalid mask: 0x2"
2334        );
2335    }
2336
2337    #[test]
2338    fn test_encoding_mask_is_only() {
2339        let mask = EncodingMask::new_from_encodings([Encoding::PLAIN].iter());
2340        assert!(mask.is_only(Encoding::PLAIN));
2341
2342        let mask =
2343            EncodingMask::new_from_encodings([Encoding::PLAIN, Encoding::PLAIN_DICTIONARY].iter());
2344        assert!(!mask.is_only(Encoding::PLAIN));
2345    }
2346}