parquet/file/
properties.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::KeyValue;
24use crate::format::SortingColumn;
25use crate::schema::types::ColumnPath;
26use std::str::FromStr;
27use std::{collections::HashMap, sync::Arc};
28
/// Default value for [`WriterProperties::data_page_size_limit`] (1 MiB)
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`] (1 MiB)
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::max_statistics_size`]
#[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
/// Default value for [`WriterProperties::max_row_group_size`] (~1 million rows)
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default values for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default values for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
66
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns writer version as `i32`.
    pub fn as_num(&self) -> i32 {
        match self {
            WriterVersion::PARQUET_1_0 => 1,
            WriterVersion::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a [`WriterVersion`] from a string such as `"PARQUET_1_0"`.
    ///
    /// Matching is ASCII case-insensitive, so mixed-case spellings such as
    /// `"Parquet_1_0"` are accepted in addition to the previously supported
    /// all-upper and all-lower forms (a backward-compatible generalization).
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.to_ascii_uppercase().as_str() {
            "PARQUET_1_0" => Ok(WriterVersion::PARQUET_1_0),
            "PARQUET_2_0" => Ok(WriterVersion::PARQUET_2_0),
            _ => Err(format!("Invalid writer version: {}", s)),
        }
    }
}
100
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The default is [`BloomFilterPosition::AfterRowGroup`]
/// (see [`DEFAULT_BLOOM_FILTER_POSITION`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
118
/// Reference counted writer properties.
///
/// Allows one [`WriterProperties`] instance to be shared cheaply between
/// multiple writers.
pub type WriterPropertiesPtr = Arc<WriterProperties>;
121
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    // Best-effort limit on the encoded size of a data page, in bytes
    data_page_size_limit: usize,
    // Best-effort limit on the encoded size of a dictionary page, in bytes
    dictionary_page_size_limit: usize,
    // Best-effort limit on the number of rows per data page
    data_page_row_count_limit: usize,
    // Number of rows written per internal batch; limits above are checked
    // between batches
    write_batch_size: usize,
    // Maximum number of rows per row group
    max_row_group_size: usize,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    // "created by" string embedded in the file footer
    created_by: String,
    offset_index_disabled: bool,
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    // Settings applied to every column unless overridden in `column_properties`
    default_column_properties: ColumnProperties,
    // Per-column overrides; take precedence over `default_column_properties`
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<FileEncryptionProperties>,
}
177
178impl Default for WriterProperties {
179    fn default() -> Self {
180        Self::builder().build()
181    }
182}
183
184impl WriterProperties {
185    /// Create a new [`WriterProperties`] with the default settings
186    ///
187    /// See [`WriterProperties::builder`] for customising settings
188    pub fn new() -> Self {
189        Self::default()
190    }
191
192    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
193    /// properties.
194    pub fn builder() -> WriterPropertiesBuilder {
195        WriterPropertiesBuilder::with_defaults()
196    }
197
198    /// Returns data page size limit.
199    ///
200    /// Note: this is a best effort limit based on the write batch size
201    ///
202    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
203    pub fn data_page_size_limit(&self) -> usize {
204        self.data_page_size_limit
205    }
206
207    /// Returns dictionary page size limit.
208    ///
209    /// Note: this is a best effort limit based on the write batch size
210    ///
211    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
212    pub fn dictionary_page_size_limit(&self) -> usize {
213        self.dictionary_page_size_limit
214    }
215
216    /// Returns the maximum page row count
217    ///
218    /// Note: this is a best effort limit based on the write batch size
219    ///
220    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
221    pub fn data_page_row_count_limit(&self) -> usize {
222        self.data_page_row_count_limit
223    }
224
225    /// Returns configured batch size for writes.
226    ///
227    /// When writing a batch of data, this setting allows to split it internally into
228    /// smaller batches so we can better estimate the size of a page currently being
229    /// written.
230    pub fn write_batch_size(&self) -> usize {
231        self.write_batch_size
232    }
233
234    /// Returns maximum number of rows in a row group.
235    pub fn max_row_group_size(&self) -> usize {
236        self.max_row_group_size
237    }
238
239    /// Returns bloom filter position.
240    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
241        self.bloom_filter_position
242    }
243
244    /// Returns configured writer version.
245    pub fn writer_version(&self) -> WriterVersion {
246        self.writer_version
247    }
248
249    /// Returns `created_by` string.
250    pub fn created_by(&self) -> &str {
251        &self.created_by
252    }
253
254    /// Returns `true` if offset index writing is disabled.
255    pub fn offset_index_disabled(&self) -> bool {
256        // If page statistics are to be collected, then do not disable the offset indexes.
257        let default_page_stats_enabled =
258            self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
259        let column_page_stats_enabled = self
260            .column_properties
261            .iter()
262            .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
263        if default_page_stats_enabled || column_page_stats_enabled {
264            return false;
265        }
266
267        self.offset_index_disabled
268    }
269
270    /// Returns `key_value_metadata` KeyValue pairs.
271    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
272        self.key_value_metadata.as_ref()
273    }
274
275    /// Returns sorting columns.
276    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
277        self.sorting_columns.as_ref()
278    }
279
280    /// Returns the maximum length of truncated min/max values in the column index.
281    ///
282    /// `None` if truncation is disabled, must be greater than 0 otherwise.
283    pub fn column_index_truncate_length(&self) -> Option<usize> {
284        self.column_index_truncate_length
285    }
286
287    /// Returns the maximum length of truncated min/max values in statistics.
288    ///
289    /// `None` if truncation is disabled, must be greater than 0 otherwise.
290    pub fn statistics_truncate_length(&self) -> Option<usize> {
291        self.statistics_truncate_length
292    }
293
294    /// Returns `true` if type coercion is enabled.
295    pub fn coerce_types(&self) -> bool {
296        self.coerce_types
297    }
298
299    /// Returns encoding for a data page, when dictionary encoding is enabled.
300    /// This is not configurable.
301    #[inline]
302    pub fn dictionary_data_page_encoding(&self) -> Encoding {
303        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
304        // Dictionary values are encoded using RLE_DICTIONARY encoding.
305        Encoding::RLE_DICTIONARY
306    }
307
308    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
309    /// This is not configurable.
310    #[inline]
311    pub fn dictionary_page_encoding(&self) -> Encoding {
312        // PLAIN_DICTIONARY is deprecated in writer version 1.
313        // Dictionary is encoded using plain encoding.
314        Encoding::PLAIN
315    }
316
317    /// Returns encoding for a column, if set.
318    /// In case when dictionary is enabled, returns fallback encoding.
319    ///
320    /// If encoding is not set, then column writer will choose the best encoding
321    /// based on the column type.
322    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
323        self.column_properties
324            .get(col)
325            .and_then(|c| c.encoding())
326            .or_else(|| self.default_column_properties.encoding())
327    }
328
329    /// Returns compression codec for a column.
330    pub fn compression(&self, col: &ColumnPath) -> Compression {
331        self.column_properties
332            .get(col)
333            .and_then(|c| c.compression())
334            .or_else(|| self.default_column_properties.compression())
335            .unwrap_or(DEFAULT_COMPRESSION)
336    }
337
338    /// Returns `true` if dictionary encoding is enabled for a column.
339    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
340        self.column_properties
341            .get(col)
342            .and_then(|c| c.dictionary_enabled())
343            .or_else(|| self.default_column_properties.dictionary_enabled())
344            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
345    }
346
347    /// Returns which statistics are written for a column.
348    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
349        self.column_properties
350            .get(col)
351            .and_then(|c| c.statistics_enabled())
352            .or_else(|| self.default_column_properties.statistics_enabled())
353            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
354    }
355
356    /// Returns max size for statistics.
357    /// Only applicable if statistics are enabled.
358    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
359    pub fn max_statistics_size(&self, col: &ColumnPath) -> usize {
360        #[allow(deprecated)]
361        self.column_properties
362            .get(col)
363            .and_then(|c| c.max_statistics_size())
364            .or_else(|| self.default_column_properties.max_statistics_size())
365            .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
366    }
367
368    /// Returns the [`BloomFilterProperties`] for the given column
369    ///
370    /// Returns `None` if bloom filter is disabled
371    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
372        self.column_properties
373            .get(col)
374            .and_then(|c| c.bloom_filter_properties())
375            .or_else(|| self.default_column_properties.bloom_filter_properties())
376    }
377
378    /// Return file encryption properties
379    #[cfg(feature = "encryption")]
380    pub fn file_encryption_properties(&self) -> Option<&FileEncryptionProperties> {
381        self.file_encryption_properties.as_ref()
382    }
383}
384
/// Builder for [`WriterProperties`] parquet writer configuration.
///
/// See example on [`WriterProperties`]
pub struct WriterPropertiesBuilder {
    // Fields mirror [`WriterProperties`]; see the corresponding getters
    // there for the meaning of each setting.
    data_page_size_limit: usize,
    dictionary_page_size_limit: usize,
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_size: usize,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    offset_index_disabled: bool,
    key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<FileEncryptionProperties>,
}
408
409impl WriterPropertiesBuilder {
    /// Returns default state of the builder.
    ///
    /// Every setting starts from the corresponding `DEFAULT_*` constant
    /// defined at the top of this module.
    fn with_defaults() -> Self {
        Self {
            data_page_size_limit: DEFAULT_PAGE_SIZE,
            dictionary_page_size_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT,
            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
            max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
            writer_version: DEFAULT_WRITER_VERSION,
            created_by: DEFAULT_CREATED_BY.to_string(),
            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
            key_value_metadata: None,
            default_column_properties: Default::default(),
            column_properties: HashMap::new(),
            sorting_columns: None,
            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
            coerce_types: DEFAULT_COERCE_TYPES,
            #[cfg(feature = "encryption")]
            file_encryption_properties: None,
        }
    }
433
    /// Finalizes the configuration and returns immutable writer properties struct.
    ///
    /// Consumes the builder; every configured value is moved field-for-field
    /// into the resulting [`WriterProperties`].
    pub fn build(self) -> WriterProperties {
        WriterProperties {
            data_page_size_limit: self.data_page_size_limit,
            dictionary_page_size_limit: self.dictionary_page_size_limit,
            data_page_row_count_limit: self.data_page_row_count_limit,
            write_batch_size: self.write_batch_size,
            max_row_group_size: self.max_row_group_size,
            bloom_filter_position: self.bloom_filter_position,
            writer_version: self.writer_version,
            created_by: self.created_by,
            offset_index_disabled: self.offset_index_disabled,
            key_value_metadata: self.key_value_metadata,
            default_column_properties: self.default_column_properties,
            column_properties: self.column_properties,
            sorting_columns: self.sorting_columns,
            column_index_truncate_length: self.column_index_truncate_length,
            statistics_truncate_length: self.statistics_truncate_length,
            coerce_types: self.coerce_types,
            #[cfg(feature = "encryption")]
            file_encryption_properties: self.file_encryption_properties,
        }
    }
457
458    // ----------------------------------------------------------------------
459    // Writer properties related to a file
460
461    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`])
462    ///
463    /// This value can determine what features some readers will support.
464    ///
465    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
466    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
467        self.writer_version = value;
468        self
469    }
470
471    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`).
472    ///
473    /// The parquet writer will attempt to limit the sizes of each
474    /// `DataPage` to this many bytes. Reducing this value will result
475    /// in larger parquet files, but may improve the effectiveness of
476    /// page index based predicate pushdown during reading.
477    ///
478    /// Note: this is a best effort limit based on value of
479    /// [`set_write_batch_size`](Self::set_write_batch_size).
480    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
481        self.data_page_size_limit = value;
482        self
483    }
484
485    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`).
486    ///
487    /// The parquet writer will attempt to limit the number of rows in
488    /// each `DataPage` to this value. Reducing this value will result
489    /// in larger parquet files, but may improve the effectiveness of
490    /// page index based predicate pushdown during reading.
491    ///
492    /// Note: this is a best effort limit based on value of
493    /// [`set_write_batch_size`](Self::set_write_batch_size).
494    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
495        self.data_page_row_count_limit = value;
496        self
497    }
498
499    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`).
500    ///
501    /// The parquet writer will attempt to limit the size of each
502    /// `DataPage` used to store dictionaries to this many
503    /// bytes. Reducing this value will result in larger parquet
504    /// files, but may improve the effectiveness of page index based
505    /// predicate pushdown during reading.
506    ///
507    /// Note: this is a best effort limit based on value of
508    /// [`set_write_batch_size`](Self::set_write_batch_size).
509    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
510        self.dictionary_page_size_limit = value;
511        self
512    }
513
514    /// Sets write batch size (defaults to 1024).
515    ///
516    /// For performance reasons, data for each column is written in
517    /// batches of this size.
518    ///
519    /// Additional limits such as such as
520    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
521    /// are checked between batches, and thus the write batch size value acts as an
522    /// upper-bound on the enforcement granularity of other limits.
523    pub fn set_write_batch_size(mut self, value: usize) -> Self {
524        self.write_batch_size = value;
525        self
526    }
527
528    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`).
529    ///
530    /// # Panics
531    /// If the value is set to 0.
532    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
533        assert!(value > 0, "Cannot have a 0 max row group size");
534        self.max_row_group_size = value;
535        self
536    }
537
538    /// Sets where in the final file Bloom Filters are written (default `AfterRowGroup`)
539    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
540        self.bloom_filter_position = value;
541        self
542    }
543
544    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>`).
545    pub fn set_created_by(mut self, value: String) -> Self {
546        self.created_by = value;
547        self
548    }
549
550    /// Sets whether the writing of offset indexes is disabled (defaults to `false`).
551    ///
552    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
553    ///
554    /// Note: As the offset indexes are useful for accessing data by row number,
555    /// they are always written by default, regardless of whether other statistics
556    /// are enabled. Disabling this metadata may result in a degradation in read
557    /// performance, so use this option with care.
558    ///
559    /// [`Page`]: EnabledStatistics::Page
560    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
561        self.offset_index_disabled = value;
562        self
563    }
564
565    /// Sets "key_value_metadata" property (defaults to `None`).
566    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
567        self.key_value_metadata = value;
568        self
569    }
570
571    /// Sets sorting order of rows in the row group if any (defaults to `None`).
572    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
573        self.sorting_columns = value;
574        self
575    }
576
577    // ----------------------------------------------------------------------
578    // Setters for any column (global)
579
580    /// Sets default encoding for all columns.
581    ///
582    /// If dictionary is not enabled, this is treated as a primary encoding for all
583    /// columns. In case when dictionary is enabled for any column, this value is
584    /// considered to be a fallback encoding for that column.
585    ///
586    /// # Panics
587    ///
588    /// if dictionary encoding is specified, regardless of dictionary
589    /// encoding flag being set.
590    pub fn set_encoding(mut self, value: Encoding) -> Self {
591        self.default_column_properties.set_encoding(value);
592        self
593    }
594
595    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`]).
596    ///
597    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
598    pub fn set_compression(mut self, value: Compression) -> Self {
599        self.default_column_properties.set_compression(value);
600        self
601    }
602
603    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`).
604    ///
605    /// Use this method to set dictionary encoding, instead of explicitly specifying
606    /// encoding in `set_encoding` method.
607    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
608        self.default_column_properties.set_dictionary_enabled(value);
609        self
610    }
611
612    /// Sets default statistics level for all columns (defaults to [`Page`]).
613    ///
614    /// [`Page`]: EnabledStatistics::Page
615    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
616        self.default_column_properties.set_statistics_enabled(value);
617        self
618    }
619
620    /// Sets default max statistics size for all columns (defaults to `4096`).
621    ///
622    /// Applicable only if statistics are enabled.
623    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
624    pub fn set_max_statistics_size(mut self, value: usize) -> Self {
625        #[allow(deprecated)]
626        self.default_column_properties
627            .set_max_statistics_size(value);
628        self
629    }
630
631    /// Sets if bloom filter is enabled by default for all columns (defaults to `false`).
632    ///
633    /// # Notes
634    ///
635    /// * If the bloom filter is enabled previously then it is a no-op.
636    ///
637    /// * If the bloom filter is not enabled, default values for ndv and fpp
638    ///   value are used used. See [`set_bloom_filter_ndv`] and
639    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
640    ///
641    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
642    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
643    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
644        self.default_column_properties
645            .set_bloom_filter_enabled(value);
646        self
647    }
648
649    /// Sets the default target bloom filter false positive probability (fpp)
650    /// for all columns (defaults to `0.05`).
651    ///
652    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
653    /// been called.
654    ///
655    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
656    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
657        self.default_column_properties.set_bloom_filter_fpp(value);
658        self
659    }
660
661    /// Sets default number of distinct values (ndv) for bloom filter for all
662    /// columns (defaults to `1_000_000`).
663    ///
664    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
665    /// been called.
666    ///
667    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
668    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
669        self.default_column_properties.set_bloom_filter_ndv(value);
670        self
671    }
672
673    // ----------------------------------------------------------------------
674    // Setters for a specific column
675
676    /// Helper method to get existing or new mutable reference of column properties.
677    #[inline]
678    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
679        self.column_properties.entry(col).or_default()
680    }
681
682    /// Sets encoding for a specific column.
683    ///
684    /// Takes precedence over [`Self::set_encoding`].
685    ///
686    /// If dictionary is not enabled, this is treated as a primary encoding for this
687    /// column. In case when dictionary is enabled for this column, either through
688    /// global defaults or explicitly, this value is considered to be a fallback
689    /// encoding for this column.
690    ///
691    /// # Panics
692    /// If user tries to set dictionary encoding here, regardless of dictionary
693    /// encoding flag being set.
694    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
695        self.get_mut_props(col).set_encoding(value);
696        self
697    }
698
699    /// Sets compression codec for a specific column.
700    ///
701    /// Takes precedence over [`Self::set_compression`].
702    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
703        self.get_mut_props(col).set_compression(value);
704        self
705    }
706
707    /// Sets flag to enable/disable dictionary encoding for a specific column.
708    ///
709    /// Takes precedence over [`Self::set_dictionary_enabled`].
710    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
711        self.get_mut_props(col).set_dictionary_enabled(value);
712        self
713    }
714
715    /// Sets statistics level for a specific column.
716    ///
717    /// Takes precedence over [`Self::set_statistics_enabled`].
718    pub fn set_column_statistics_enabled(
719        mut self,
720        col: ColumnPath,
721        value: EnabledStatistics,
722    ) -> Self {
723        self.get_mut_props(col).set_statistics_enabled(value);
724        self
725    }
726
727    /// Sets max size for statistics for a specific column.
728    ///
729    /// Takes precedence over [`Self::set_max_statistics_size`].
730    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
731    pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self {
732        #[allow(deprecated)]
733        self.get_mut_props(col).set_max_statistics_size(value);
734        self
735    }
736
737    /// Sets whether a bloom filter should be written for a specific column.
738    ///
739    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
740    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
741        self.get_mut_props(col).set_bloom_filter_enabled(value);
742        self
743    }
744
745    /// Sets the false positive probability for bloom filter for a specific column.
746    ///
747    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
748    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
749        self.get_mut_props(col).set_bloom_filter_fpp(value);
750        self
751    }
752
753    /// Sets the number of distinct values for bloom filter for a specific column.
754    ///
755    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
756    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
757        self.get_mut_props(col).set_bloom_filter_ndv(value);
758        self
759    }
760
761    /// Sets the max length of min/max value fields when writing the column
762    /// [`Index`] (defaults to `None`).
763    ///
764    /// This can be used to prevent columns with very long values (hundreds of
765    /// bytes long) from causing the parquet metadata to become huge.
766    ///
767    /// # Notes
768    ///
769    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
770    /// set to [`EnabledStatistics::Page`].
771    ///
772    /// * If `Some`, must be greater than 0, otherwise will panic
773    /// * If `None`, there's no effective limit.
774    ///
775    /// [`Index`]: crate::file::page_index::index::Index
776    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
777        if let Some(value) = max_length {
778            assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`.");
779        }
780
781        self.column_index_truncate_length = max_length;
782        self
783    }
784
785    /// Sets the max length of min/max value fields in row group level
786    /// [`Statistics`] (defaults to `None`).
787    ///
788    /// # Notes
789    /// Row group level [`Statistics`] are written when [`Self::set_statistics_enabled`] is
790    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`].
791    ///
792    /// * If `Some`, must be greater than 0, otherwise will panic
793    /// * If `None`, there's no effective limit.
794    ///
795    /// [`Statistics`]: crate::file::statistics::Statistics
796    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
797        if let Some(value) = max_length {
798            assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
799        }
800
801        self.statistics_truncate_length = max_length;
802        self
803    }
804
805    /// Should the writer coerce types to parquet native types (defaults to `false`).
806    ///
807    /// Leaving this option the default `false` will ensure the exact same data
808    /// written to parquet using this library will be read.
809    ///
810    /// Setting this option to `true` will result in parquet files that can be
811    /// read by more readers, but potentially lose information in the process.
812    ///
813    /// * Types such as [`DataType::Date64`], which have no direct corresponding
814    ///   Parquet type, may be stored with lower precision.
815    ///
816    /// * The internal field names of `List` and `Map` types will be renamed if
817    ///   necessary to match what is required by the newest Parquet specification.
818    ///
819    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
820    ///
821    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
822    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
823    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
824        self.coerce_types = coerce_types;
825        self
826    }
827
828    /// Sets FileEncryptionProperties.
829    #[cfg(feature = "encryption")]
830    pub fn with_file_encryption_properties(
831        mut self,
832        file_encryption_properties: FileEncryptionProperties,
833    ) -> Self {
834        self.file_encryption_properties = Some(file_encryption_properties);
835        self
836    }
837}
838
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
///
/// A level can also be parsed from the strings `"none"`, `"chunk"` or `"page"`
/// (all upper or all lower case) via [`FromStr`].
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each page and row group. The more row groups and the more
    /// pages written, the more statistics will be stored.
    Page,
}
865
866impl FromStr for EnabledStatistics {
867    type Err = String;
868
869    fn from_str(s: &str) -> Result<Self, Self::Err> {
870        match s {
871            "NONE" | "none" => Ok(EnabledStatistics::None),
872            "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk),
873            "PAGE" | "page" => Ok(EnabledStatistics::Page),
874            _ => Err(format!("Invalid statistics arg: {}", s)),
875        }
876    }
877}
878
impl Default for EnabledStatistics {
    fn default() -> Self {
        // Mirrors the crate-wide default used by `WriterProperties`.
        DEFAULT_STATISTICS_ENABLED
    }
}
884
/// Controls the bloom filter to be computed by the writer.
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability, should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp, the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// Setting it to a very small number diminishes the value of the filter itself, as the bitset size is
    /// even larger than just storing the whole value. You are also expected to set `ndv` if it can
    /// be known in advance in order to largely reduce space usage.
    pub fpp: f64,
    /// Number of distinct values, should be non-negative to be meaningful. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Usage of bloom filter is most beneficial for columns with large cardinality, so a good heuristic
    /// is to set ndv to the number of rows. However it can reduce disk size if you know in advance a smaller
    /// number of distinct values. For a very small ndv value it is probably not worth it to use a bloom filter
    /// anyway.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
    pub ndv: u64,
}
912
913impl Default for BloomFilterProperties {
914    fn default() -> Self {
915        BloomFilterProperties {
916            fpp: DEFAULT_BLOOM_FILTER_FPP,
917            ndv: DEFAULT_BLOOM_FILTER_NDV,
918        }
919    }
920}
921
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    /// Encoding for this column; used as the fallback encoding when
    /// dictionary encoding is enabled (see `set_encoding`).
    encoding: Option<Encoding>,
    /// Compression codec for this column.
    codec: Option<Compression>,
    /// Whether dictionary encoding is enabled for this column.
    dictionary_enabled: Option<bool>,
    /// Level of statistics (none/chunk/page) to write for this column.
    statistics_enabled: Option<EnabledStatistics>,
    /// Unused; retained only until its scheduled removal.
    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
    max_statistics_size: Option<usize>,
    /// bloom filter related properties
    bloom_filter_properties: Option<BloomFilterProperties>,
}
937
938impl ColumnProperties {
939    /// Sets encoding for this column.
940    ///
941    /// If dictionary is not enabled, this is treated as a primary encoding for a column.
942    /// In case when dictionary is enabled for a column, this value is considered to
943    /// be a fallback encoding.
944    ///
945    /// Panics if user tries to set dictionary encoding here, regardless of dictionary
946    /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
947    /// for a column.
948    fn set_encoding(&mut self, value: Encoding) {
949        if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
950            panic!("Dictionary encoding can not be used as fallback encoding");
951        }
952        self.encoding = Some(value);
953    }
954
955    /// Sets compression codec for this column.
956    fn set_compression(&mut self, value: Compression) {
957        self.codec = Some(value);
958    }
959
960    /// Sets whether or not dictionary encoding is enabled for this column.
961    fn set_dictionary_enabled(&mut self, enabled: bool) {
962        self.dictionary_enabled = Some(enabled);
963    }
964
965    /// Sets the statistics level for this column.
966    fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
967        self.statistics_enabled = Some(enabled);
968    }
969
970    /// Sets max size for statistics for this column.
971    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
972    #[allow(deprecated)]
973    fn set_max_statistics_size(&mut self, value: usize) {
974        self.max_statistics_size = Some(value);
975    }
976
977    /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
978    /// otherwise it is a no-op.
979    /// If `value` is `false`, resets bloom filter properties to `None`.
980    fn set_bloom_filter_enabled(&mut self, value: bool) {
981        if value && self.bloom_filter_properties.is_none() {
982            self.bloom_filter_properties = Some(Default::default())
983        } else if !value {
984            self.bloom_filter_properties = None
985        }
986    }
987
988    /// Sets the false positive probability for bloom filter for this column, and implicitly enables
989    /// bloom filter if not previously enabled.
990    ///
991    /// # Panics
992    ///
993    /// Panics if the `value` is not between 0 and 1 exclusive
994    fn set_bloom_filter_fpp(&mut self, value: f64) {
995        assert!(
996            value > 0. && value < 1.0,
997            "fpp must be between 0 and 1 exclusive, got {value}"
998        );
999
1000        self.bloom_filter_properties
1001            .get_or_insert_with(Default::default)
1002            .fpp = value;
1003    }
1004
1005    /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
1006    /// enables bloom filter if not previously enabled.
1007    fn set_bloom_filter_ndv(&mut self, value: u64) {
1008        self.bloom_filter_properties
1009            .get_or_insert_with(Default::default)
1010            .ndv = value;
1011    }
1012
1013    /// Returns optional encoding for this column.
1014    fn encoding(&self) -> Option<Encoding> {
1015        self.encoding
1016    }
1017
1018    /// Returns optional compression codec for this column.
1019    fn compression(&self) -> Option<Compression> {
1020        self.codec
1021    }
1022
1023    /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1024    /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1025    /// been provided.
1026    fn dictionary_enabled(&self) -> Option<bool> {
1027        self.dictionary_enabled
1028    }
1029
1030    /// Returns optional statistics level requested for this column. If result is `None`,
1031    /// then no setting has been provided.
1032    fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1033        self.statistics_enabled
1034    }
1035
1036    /// Returns optional max size in bytes for statistics.
1037    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
1038    fn max_statistics_size(&self) -> Option<usize> {
1039        #[allow(deprecated)]
1040        self.max_statistics_size
1041    }
1042
1043    /// Returns the bloom filter properties, or `None` if not enabled
1044    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1045        self.bloom_filter_properties.as_ref()
1046    }
1047}
1048
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

/// Default for [`ReaderPropertiesBuilder::set_read_bloom_filter`]: bloom
/// filters are not read unless explicitly requested.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
1053
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    /// Options forwarded to the decompression codecs (e.g. LZ4 backward
    /// compatibility, see [`ReaderPropertiesBuilder::set_backward_compatible_lz4`]).
    codec_options: CodecOptions,
    /// Whether bloom filters should be read from the file.
    read_bloom_filter: bool,
}
1076
impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether to read bloom filter
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }
}
1093
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    /// Accumulates codec options; built into `CodecOptions` by `build`.
    codec_options_builder: CodecOptionsBuilder,
    /// `None` means "use `DEFAULT_READ_BLOOM_FILTER`" at build time.
    read_bloom_filter: Option<bool>,
}
1100
1101/// Reader properties builder.
1102impl ReaderPropertiesBuilder {
1103    /// Returns default state of the builder.
1104    fn with_defaults() -> Self {
1105        Self {
1106            codec_options_builder: CodecOptionsBuilder::default(),
1107            read_bloom_filter: None,
1108        }
1109    }
1110
1111    /// Finalizes the configuration and returns immutable reader properties struct.
1112    pub fn build(self) -> ReaderProperties {
1113        ReaderProperties {
1114            codec_options: self.codec_options_builder.build(),
1115            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1116        }
1117    }
1118
1119    /// Enable/disable backward compatible LZ4.
1120    ///
1121    /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1122    /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1123    /// with files generated by older versions of this library, and LZ4_RAW, for backward
1124    /// compatibility with files generated by older versions of parquet-cpp.
1125    ///
1126    /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1127    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1128        self.codec_options_builder = self
1129            .codec_options_builder
1130            .set_backward_compatible_lz4(value);
1131        self
1132    }
1133
1134    /// Enable/disable reading bloom filter
1135    ///
1136    /// If reading bloom filter is enabled, bloom filter will be read from the file.
1137    /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1138    ///
1139    /// By default bloom filter is set to be read.
1140    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1141        self.read_bloom_filter = Some(value);
1142        self
1143    }
1144}
1145
#[cfg(test)]
mod tests {
    use super::*;

    // `as_num` maps each writer version to its numeric parquet format version.
    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    // Every getter on a default `WriterProperties` must match the crate's
    // `DEFAULT_*` constants; unknown columns fall back to the global settings.
    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        // Bloom filters are opt-in, so none are configured by default.
        assert!(props
            .bloom_filter_properties(&ColumnPath::from("col"))
            .is_none());
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    // The panic must fire regardless of the dictionary_enabled flag value:
    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    // Exercises the full builder surface: file-level settings, global column
    // settings, and per-column overrides (column "col" vs. unconfigured "a").
    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_size(40)
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
        assert_eq!(props.data_page_size_limit(), 10);
        assert_eq!(props.dictionary_page_size_limit(), 20);
        assert_eq!(props.write_batch_size(), 30);
        assert_eq!(props.max_row_group_size(), 40);
        assert_eq!(props.created_by(), "default");
        assert_eq!(
            props.key_value_metadata(),
            Some(&vec![
                KeyValue::new("key".to_string(), "value".to_string(),)
            ])
        );

        // Column "a" was never configured, so it resolves to the global settings.
        assert_eq!(
            props.encoding(&ColumnPath::from("a")),
            Some(Encoding::DELTA_BINARY_PACKED)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("a")),
            Compression::GZIP(Default::default())
        );
        assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("a")),
            EnabledStatistics::None
        );

        // Column "col" has per-column overrides which take precedence.
        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::SNAPPY
        );
        assert!(props.dictionary_enabled(&ColumnPath::from("col")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            EnabledStatistics::Chunk
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
        );
    }

    // Options that are not explicitly set must fall back to crate defaults,
    // even when other options were customized.
    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 1_000_000_u64
            })
        );
    }

    // Setting ndv or fpp alone implicitly enables the bloom filter with the
    // other field defaulted.
    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 100
            })
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: 1_000_000_u64
            })
        );
    }

    // Defaults: LZ4 backward compatibility on, bloom filter reading off.
    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(!props.read_bloom_filter());
    }

    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
    }

    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);

        //test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }
}