parquet/file/
properties.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21use crate::file::metadata::KeyValue;
22use crate::format::SortingColumn;
23use crate::schema::types::ColumnPath;
24use std::str::FromStr;
25use std::{collections::HashMap, sync::Arc};
26
27/// Default value for [`WriterProperties::data_page_size_limit`]
28pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
29/// Default value for [`WriterProperties::write_batch_size`]
30pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
31/// Default value for [`WriterProperties::writer_version`]
32pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
33/// Default value for [`WriterProperties::compression`]
34pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
35/// Default value for [`WriterProperties::dictionary_enabled`]
36pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
37/// Default value for [`WriterProperties::dictionary_page_size_limit`]
38pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
39/// Default value for [`WriterProperties::data_page_row_count_limit`]
40pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
41/// Default value for [`WriterProperties::statistics_enabled`]
42pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
43/// Default value for [`WriterProperties::max_statistics_size`]
44#[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
45pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
46/// Default value for [`WriterProperties::max_row_group_size`]
47pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
48/// Default value for [`WriterProperties::bloom_filter_position`]
49pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
50/// Default value for [`WriterProperties::created_by`]
51pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
52/// Default value for [`WriterProperties::column_index_truncate_length`]
53pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
54/// Default value for [`BloomFilterProperties::fpp`]
55pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
56/// Default value for [`BloomFilterProperties::ndv`]
57pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
58/// Default values for [`WriterProperties::statistics_truncate_length`]
59pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;
60/// Default value for [`WriterProperties::offset_index_disabled`]
61pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
62/// Default values for [`WriterProperties::coerce_types`]
63pub const DEFAULT_COERCE_TYPES: bool = false;
64
/// Version of the Parquet format a writer targets.
///
/// This is a plain constant of this crate; it is not part of the Thrift
/// definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns the numeric form of the version: `1` for
    /// [`WriterVersion::PARQUET_1_0`] and `2` for [`WriterVersion::PARQUET_2_0`].
    pub fn as_num(&self) -> i32 {
        match *self {
            Self::PARQUET_1_0 => 1,
            Self::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a writer version from its variant name.
    ///
    /// Only the all-uppercase and all-lowercase spellings are recognised
    /// (for example `"PARQUET_1_0"` and `"parquet_1_0"`); any other input,
    /// including mixed case, yields an `Err` describing the rejected string.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let version = match s {
            "PARQUET_1_0" | "parquet_1_0" => Self::PARQUET_1_0,
            "PARQUET_2_0" | "parquet_2_0" => Self::PARQUET_2_0,
            _ => return Err(format!("Invalid writer version: {}", s)),
        };
        Ok(version)
    }
}
98
/// Location in the file at which
/// [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) places Bloom filters.
///
/// This is a plain constant of this crate; it is not part of the Thrift
/// definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Each row group's Bloom filters are written directly after that row group.
    ///
    /// The filters are written as soon as they are computed, which saves
    /// writer memory but costs readers some data locality.
    AfterRowGroup,
    /// All Bloom filters are written at the end of the file.
    ///
    /// Readers get better data locality, while the writer uses more memory.
    End,
}
116
117/// Reference counted writer properties.
118pub type WriterPropertiesPtr = Arc<WriterProperties>;
119
120/// Configuration settings for writing parquet files.
121///
122/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
123///
124/// # Example
125///
126/// ```rust
127/// # use parquet::{
128/// #    basic::{Compression, Encoding},
129/// #    file::properties::*,
130/// #    schema::types::ColumnPath,
131/// # };
132/// #
133/// // Create properties with default configuration.
134/// let props = WriterProperties::default();
135///
136/// // Use properties builder to set certain options and assemble the configuration.
137/// let props = WriterProperties::builder()
138///     .set_writer_version(WriterVersion::PARQUET_1_0)
139///     .set_encoding(Encoding::PLAIN)
140///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
141///     .set_compression(Compression::SNAPPY)
142///     .build();
143///
144/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
145/// assert_eq!(
146///     props.encoding(&ColumnPath::from("col1")),
147///     Some(Encoding::DELTA_BINARY_PACKED)
148/// );
149/// assert_eq!(
150///     props.encoding(&ColumnPath::from("col2")),
151///     Some(Encoding::PLAIN)
152/// );
153/// ```
154#[derive(Debug, Clone)]
155pub struct WriterProperties {
156    data_page_size_limit: usize,
157    dictionary_page_size_limit: usize,
158    data_page_row_count_limit: usize,
159    write_batch_size: usize,
160    max_row_group_size: usize,
161    bloom_filter_position: BloomFilterPosition,
162    writer_version: WriterVersion,
163    created_by: String,
164    offset_index_disabled: bool,
165    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
166    default_column_properties: ColumnProperties,
167    column_properties: HashMap<ColumnPath, ColumnProperties>,
168    sorting_columns: Option<Vec<SortingColumn>>,
169    column_index_truncate_length: Option<usize>,
170    statistics_truncate_length: Option<usize>,
171    coerce_types: bool,
172}
173
174impl Default for WriterProperties {
175    fn default() -> Self {
176        Self::builder().build()
177    }
178}
179
180impl WriterProperties {
181    /// Create a new [`WriterProperties`] with the default settings
182    ///
183    /// See [`WriterProperties::builder`] for customising settings
184    pub fn new() -> Self {
185        Self::default()
186    }
187
188    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
189    /// properties.
190    pub fn builder() -> WriterPropertiesBuilder {
191        WriterPropertiesBuilder::with_defaults()
192    }
193
194    /// Returns data page size limit.
195    ///
196    /// Note: this is a best effort limit based on the write batch size
197    ///
198    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
199    pub fn data_page_size_limit(&self) -> usize {
200        self.data_page_size_limit
201    }
202
203    /// Returns dictionary page size limit.
204    ///
205    /// Note: this is a best effort limit based on the write batch size
206    ///
207    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
208    pub fn dictionary_page_size_limit(&self) -> usize {
209        self.dictionary_page_size_limit
210    }
211
212    /// Returns the maximum page row count
213    ///
214    /// Note: this is a best effort limit based on the write batch size
215    ///
216    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
217    pub fn data_page_row_count_limit(&self) -> usize {
218        self.data_page_row_count_limit
219    }
220
221    /// Returns configured batch size for writes.
222    ///
223    /// When writing a batch of data, this setting allows to split it internally into
224    /// smaller batches so we can better estimate the size of a page currently being
225    /// written.
226    pub fn write_batch_size(&self) -> usize {
227        self.write_batch_size
228    }
229
230    /// Returns maximum number of rows in a row group.
231    pub fn max_row_group_size(&self) -> usize {
232        self.max_row_group_size
233    }
234
235    /// Returns maximum number of rows in a row group.
236    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
237        self.bloom_filter_position
238    }
239
240    /// Returns configured writer version.
241    pub fn writer_version(&self) -> WriterVersion {
242        self.writer_version
243    }
244
245    /// Returns `created_by` string.
246    pub fn created_by(&self) -> &str {
247        &self.created_by
248    }
249
250    /// Returns `true` if offset index writing is disabled.
251    pub fn offset_index_disabled(&self) -> bool {
252        // If page statistics are to be collected, then do not disable the offset indexes.
253        let default_page_stats_enabled =
254            self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
255        let column_page_stats_enabled = self
256            .column_properties
257            .iter()
258            .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
259        if default_page_stats_enabled || column_page_stats_enabled {
260            return false;
261        }
262
263        self.offset_index_disabled
264    }
265
266    /// Returns `key_value_metadata` KeyValue pairs.
267    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
268        self.key_value_metadata.as_ref()
269    }
270
271    /// Returns sorting columns.
272    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
273        self.sorting_columns.as_ref()
274    }
275
276    /// Returns the maximum length of truncated min/max values in the column index.
277    ///
278    /// `None` if truncation is disabled, must be greater than 0 otherwise.
279    pub fn column_index_truncate_length(&self) -> Option<usize> {
280        self.column_index_truncate_length
281    }
282
283    /// Returns the maximum length of truncated min/max values in statistics.
284    ///
285    /// `None` if truncation is disabled, must be greater than 0 otherwise.
286    pub fn statistics_truncate_length(&self) -> Option<usize> {
287        self.statistics_truncate_length
288    }
289
290    /// Returns `true` if type coercion is enabled.
291    pub fn coerce_types(&self) -> bool {
292        self.coerce_types
293    }
294
295    /// Returns encoding for a data page, when dictionary encoding is enabled.
296    /// This is not configurable.
297    #[inline]
298    pub fn dictionary_data_page_encoding(&self) -> Encoding {
299        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
300        // Dictionary values are encoded using RLE_DICTIONARY encoding.
301        Encoding::RLE_DICTIONARY
302    }
303
304    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
305    /// This is not configurable.
306    #[inline]
307    pub fn dictionary_page_encoding(&self) -> Encoding {
308        // PLAIN_DICTIONARY is deprecated in writer version 1.
309        // Dictionary is encoded using plain encoding.
310        Encoding::PLAIN
311    }
312
313    /// Returns encoding for a column, if set.
314    /// In case when dictionary is enabled, returns fallback encoding.
315    ///
316    /// If encoding is not set, then column writer will choose the best encoding
317    /// based on the column type.
318    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
319        self.column_properties
320            .get(col)
321            .and_then(|c| c.encoding())
322            .or_else(|| self.default_column_properties.encoding())
323    }
324
325    /// Returns compression codec for a column.
326    pub fn compression(&self, col: &ColumnPath) -> Compression {
327        self.column_properties
328            .get(col)
329            .and_then(|c| c.compression())
330            .or_else(|| self.default_column_properties.compression())
331            .unwrap_or(DEFAULT_COMPRESSION)
332    }
333
334    /// Returns `true` if dictionary encoding is enabled for a column.
335    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
336        self.column_properties
337            .get(col)
338            .and_then(|c| c.dictionary_enabled())
339            .or_else(|| self.default_column_properties.dictionary_enabled())
340            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
341    }
342
343    /// Returns which statistics are written for a column.
344    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
345        self.column_properties
346            .get(col)
347            .and_then(|c| c.statistics_enabled())
348            .or_else(|| self.default_column_properties.statistics_enabled())
349            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
350    }
351
352    /// Returns max size for statistics.
353    /// Only applicable if statistics are enabled.
354    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
355    pub fn max_statistics_size(&self, col: &ColumnPath) -> usize {
356        #[allow(deprecated)]
357        self.column_properties
358            .get(col)
359            .and_then(|c| c.max_statistics_size())
360            .or_else(|| self.default_column_properties.max_statistics_size())
361            .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
362    }
363
364    /// Returns the [`BloomFilterProperties`] for the given column
365    ///
366    /// Returns `None` if bloom filter is disabled
367    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
368        self.column_properties
369            .get(col)
370            .and_then(|c| c.bloom_filter_properties())
371            .or_else(|| self.default_column_properties.bloom_filter_properties())
372    }
373}
374
375/// Builder for  [`WriterProperties`] parquet writer configuration.
376///
377/// See example on [`WriterProperties`]
378pub struct WriterPropertiesBuilder {
379    data_page_size_limit: usize,
380    dictionary_page_size_limit: usize,
381    data_page_row_count_limit: usize,
382    write_batch_size: usize,
383    max_row_group_size: usize,
384    bloom_filter_position: BloomFilterPosition,
385    writer_version: WriterVersion,
386    created_by: String,
387    offset_index_disabled: bool,
388    key_value_metadata: Option<Vec<KeyValue>>,
389    default_column_properties: ColumnProperties,
390    column_properties: HashMap<ColumnPath, ColumnProperties>,
391    sorting_columns: Option<Vec<SortingColumn>>,
392    column_index_truncate_length: Option<usize>,
393    statistics_truncate_length: Option<usize>,
394    coerce_types: bool,
395}
396
397impl WriterPropertiesBuilder {
398    /// Returns default state of the builder.
399    fn with_defaults() -> Self {
400        Self {
401            data_page_size_limit: DEFAULT_PAGE_SIZE,
402            dictionary_page_size_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT,
403            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
404            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
405            max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
406            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
407            writer_version: DEFAULT_WRITER_VERSION,
408            created_by: DEFAULT_CREATED_BY.to_string(),
409            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
410            key_value_metadata: None,
411            default_column_properties: Default::default(),
412            column_properties: HashMap::new(),
413            sorting_columns: None,
414            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
415            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
416            coerce_types: DEFAULT_COERCE_TYPES,
417        }
418    }
419
420    /// Finalizes the configuration and returns immutable writer properties struct.
421    pub fn build(self) -> WriterProperties {
422        WriterProperties {
423            data_page_size_limit: self.data_page_size_limit,
424            dictionary_page_size_limit: self.dictionary_page_size_limit,
425            data_page_row_count_limit: self.data_page_row_count_limit,
426            write_batch_size: self.write_batch_size,
427            max_row_group_size: self.max_row_group_size,
428            bloom_filter_position: self.bloom_filter_position,
429            writer_version: self.writer_version,
430            created_by: self.created_by,
431            offset_index_disabled: self.offset_index_disabled,
432            key_value_metadata: self.key_value_metadata,
433            default_column_properties: self.default_column_properties,
434            column_properties: self.column_properties,
435            sorting_columns: self.sorting_columns,
436            column_index_truncate_length: self.column_index_truncate_length,
437            statistics_truncate_length: self.statistics_truncate_length,
438            coerce_types: self.coerce_types,
439        }
440    }
441
442    // ----------------------------------------------------------------------
443    // Writer properties related to a file
444
445    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`])
446    ///
447    /// This value can determine what features some readers will support.
448    ///
449    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
450    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
451        self.writer_version = value;
452        self
453    }
454
455    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`).
456    ///
457    /// The parquet writer will attempt to limit the sizes of each
458    /// `DataPage` to this many bytes. Reducing this value will result
459    /// in larger parquet files, but may improve the effectiveness of
460    /// page index based predicate pushdown during reading.
461    ///
462    /// Note: this is a best effort limit based on value of
463    /// [`set_write_batch_size`](Self::set_write_batch_size).
464    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
465        self.data_page_size_limit = value;
466        self
467    }
468
469    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`).
470    ///
471    /// The parquet writer will attempt to limit the number of rows in
472    /// each `DataPage` to this value. Reducing this value will result
473    /// in larger parquet files, but may improve the effectiveness of
474    /// page index based predicate pushdown during reading.
475    ///
476    /// Note: this is a best effort limit based on value of
477    /// [`set_write_batch_size`](Self::set_write_batch_size).
478    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
479        self.data_page_row_count_limit = value;
480        self
481    }
482
483    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`).
484    ///
485    /// The parquet writer will attempt to limit the size of each
486    /// `DataPage` used to store dictionaries to this many
487    /// bytes. Reducing this value will result in larger parquet
488    /// files, but may improve the effectiveness of page index based
489    /// predicate pushdown during reading.
490    ///
491    /// Note: this is a best effort limit based on value of
492    /// [`set_write_batch_size`](Self::set_write_batch_size).
493    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
494        self.dictionary_page_size_limit = value;
495        self
496    }
497
498    /// Sets write batch size (defaults to 1024).
499    ///
500    /// For performance reasons, data for each column is written in
501    /// batches of this size.
502    ///
503    /// Additional limits such as such as
504    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
505    /// are checked between batches, and thus the write batch size value acts as an
506    /// upper-bound on the enforcement granularity of other limits.
507    pub fn set_write_batch_size(mut self, value: usize) -> Self {
508        self.write_batch_size = value;
509        self
510    }
511
512    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`).
513    ///
514    /// # Panics
515    /// If the value is set to 0.
516    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
517        assert!(value > 0, "Cannot have a 0 max row group size");
518        self.max_row_group_size = value;
519        self
520    }
521
522    /// Sets where in the final file Bloom Filters are written (default `AfterRowGroup`)
523    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
524        self.bloom_filter_position = value;
525        self
526    }
527
528    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>`).
529    pub fn set_created_by(mut self, value: String) -> Self {
530        self.created_by = value;
531        self
532    }
533
534    /// Sets whether the writing of offset indexes is disabled (defaults to `false`).
535    ///
536    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
537    ///
538    /// Note: As the offset indexes are useful for accessing data by row number,
539    /// they are always written by default, regardless of whether other statistics
540    /// are enabled. Disabling this metadata may result in a degradation in read
541    /// performance, so use this option with care.
542    ///
543    /// [`Page`]: EnabledStatistics::Page
544    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
545        self.offset_index_disabled = value;
546        self
547    }
548
549    /// Sets "key_value_metadata" property (defaults to `None`).
550    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
551        self.key_value_metadata = value;
552        self
553    }
554
555    /// Sets sorting order of rows in the row group if any (defaults to `None`).
556    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
557        self.sorting_columns = value;
558        self
559    }
560
561    // ----------------------------------------------------------------------
562    // Setters for any column (global)
563
564    /// Sets default encoding for all columns.
565    ///
566    /// If dictionary is not enabled, this is treated as a primary encoding for all
567    /// columns. In case when dictionary is enabled for any column, this value is
568    /// considered to be a fallback encoding for that column.
569    ///
570    /// # Panics
571    ///
572    /// if dictionary encoding is specified, regardless of dictionary
573    /// encoding flag being set.
574    pub fn set_encoding(mut self, value: Encoding) -> Self {
575        self.default_column_properties.set_encoding(value);
576        self
577    }
578
579    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`]).
580    ///
581    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
582    pub fn set_compression(mut self, value: Compression) -> Self {
583        self.default_column_properties.set_compression(value);
584        self
585    }
586
587    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`).
588    ///
589    /// Use this method to set dictionary encoding, instead of explicitly specifying
590    /// encoding in `set_encoding` method.
591    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
592        self.default_column_properties.set_dictionary_enabled(value);
593        self
594    }
595
596    /// Sets default statistics level for all columns (defaults to [`Page`]).
597    ///
598    /// [`Page`]: EnabledStatistics::Page
599    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
600        self.default_column_properties.set_statistics_enabled(value);
601        self
602    }
603
604    /// Sets default max statistics size for all columns (defaults to `4096`).
605    ///
606    /// Applicable only if statistics are enabled.
607    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
608    pub fn set_max_statistics_size(mut self, value: usize) -> Self {
609        #[allow(deprecated)]
610        self.default_column_properties
611            .set_max_statistics_size(value);
612        self
613    }
614
615    /// Sets if bloom filter is enabled by default for all columns (defaults to `false`).
616    ///
617    /// # Notes
618    ///
619    /// * If the bloom filter is enabled previously then it is a no-op.
620    ///
621    /// * If the bloom filter is not enabled, default values for ndv and fpp
622    ///   value are used used. See [`set_bloom_filter_ndv`] and
623    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
624    ///
625    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
626    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
627    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
628        self.default_column_properties
629            .set_bloom_filter_enabled(value);
630        self
631    }
632
633    /// Sets the default target bloom filter false positive probability (fpp)
634    /// for all columns (defaults to `0.05`).
635    ///
636    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
637    /// been called.
638    ///
639    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
640    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
641        self.default_column_properties.set_bloom_filter_fpp(value);
642        self
643    }
644
645    /// Sets default number of distinct values (ndv) for bloom filter for all
646    /// columns (defaults to `1_000_000`).
647    ///
648    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
649    /// been called.
650    ///
651    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
652    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
653        self.default_column_properties.set_bloom_filter_ndv(value);
654        self
655    }
656
657    // ----------------------------------------------------------------------
658    // Setters for a specific column
659
660    /// Helper method to get existing or new mutable reference of column properties.
661    #[inline]
662    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
663        self.column_properties.entry(col).or_default()
664    }
665
666    /// Sets encoding for a specific column.
667    ///
668    /// Takes precedence over [`Self::set_encoding`].
669    ///
670    /// If dictionary is not enabled, this is treated as a primary encoding for this
671    /// column. In case when dictionary is enabled for this column, either through
672    /// global defaults or explicitly, this value is considered to be a fallback
673    /// encoding for this column.
674    ///
675    /// # Panics
676    /// If user tries to set dictionary encoding here, regardless of dictionary
677    /// encoding flag being set.
678    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
679        self.get_mut_props(col).set_encoding(value);
680        self
681    }
682
683    /// Sets compression codec for a specific column.
684    ///
685    /// Takes precedence over [`Self::set_compression`].
686    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
687        self.get_mut_props(col).set_compression(value);
688        self
689    }
690
691    /// Sets flag to enable/disable dictionary encoding for a specific column.
692    ///
693    /// Takes precedence over [`Self::set_dictionary_enabled`].
694    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
695        self.get_mut_props(col).set_dictionary_enabled(value);
696        self
697    }
698
699    /// Sets statistics level for a specific column.
700    ///
701    /// Takes precedence over [`Self::set_statistics_enabled`].
702    pub fn set_column_statistics_enabled(
703        mut self,
704        col: ColumnPath,
705        value: EnabledStatistics,
706    ) -> Self {
707        self.get_mut_props(col).set_statistics_enabled(value);
708        self
709    }
710
711    /// Sets max size for statistics for a specific column.
712    ///
713    /// Takes precedence over [`Self::set_max_statistics_size`].
714    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
715    pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self {
716        #[allow(deprecated)]
717        self.get_mut_props(col).set_max_statistics_size(value);
718        self
719    }
720
721    /// Sets whether a bloom filter should be written for a specific column.
722    ///
723    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
724    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
725        self.get_mut_props(col).set_bloom_filter_enabled(value);
726        self
727    }
728
729    /// Sets the false positive probability for bloom filter for a specific column.
730    ///
731    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
732    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
733        self.get_mut_props(col).set_bloom_filter_fpp(value);
734        self
735    }
736
737    /// Sets the number of distinct values for bloom filter for a specific column.
738    ///
739    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
740    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
741        self.get_mut_props(col).set_bloom_filter_ndv(value);
742        self
743    }
744
745    /// Sets the max length of min/max value fields when writing the column
746    /// [`Index`] (defaults to `None`).
747    ///
748    /// This can be used to prevent columns with very long values (hundreds of
749    /// bytes long) from causing the parquet metadata to become huge.
750    ///
751    /// # Notes
752    ///
753    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
754    /// set to [`EnabledStatistics::Page`].
755    ///
756    /// * If `Some`, must be greater than 0, otherwise will panic
757    /// * If `None`, there's no effective limit.
758    ///
759    /// [`Index`]: crate::file::page_index::index::Index
760    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
761        if let Some(value) = max_length {
762            assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`.");
763        }
764
765        self.column_index_truncate_length = max_length;
766        self
767    }
768
769    /// Sets the max length of min/max value fields in row group level
770    /// [`Statistics`] (defaults to `None`).
771    ///
772    /// # Notes
773    /// Row group level [`Statistics`] are written when [`Self::set_statistics_enabled`] is
774    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`].
775    ///
776    /// * If `Some`, must be greater than 0, otherwise will panic
777    /// * If `None`, there's no effective limit.
778    ///
779    /// [`Statistics`]: crate::file::statistics::Statistics
780    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
781        if let Some(value) = max_length {
782            assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
783        }
784
785        self.statistics_truncate_length = max_length;
786        self
787    }
788
789    /// Should the writer coerce types to parquet native types (defaults to `false`).
790    ///
791    /// Leaving this option the default `false` will ensure the exact same data
792    /// written to parquet using this library will be read.
793    ///
794    /// Setting this option to `true` will result in parquet files that can be
795    /// read by more readers, but potentially lose information in the process.
796    ///
797    /// * Types such as [`DataType::Date64`], which have no direct corresponding
798    ///   Parquet type, may be stored with lower precision.
799    ///
800    /// * The internal field names of `List` and `Map` types will be renamed if
801    ///   necessary to match what is required by the newest Parquet specification.
802    ///
803    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
804    ///
805    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
806    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
807    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
808        self.coerce_types = coerce_types;
809        self
810    }
811}
812
/// Level of statistics the writer computes and stores in the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// In exchange, a query engine that can evaluate predicates against the
/// statistics may prune row groups and pages during query execution.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// One set of statistics is stored for each relevant column for each row
    /// group, so the more row groups written, the more statistics are stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// One set of statistics is stored for each relevant column for each page
    /// and row group, so the more row groups and pages written, the more
    /// statistics are stored.
    Page,
}
839
840impl FromStr for EnabledStatistics {
841    type Err = String;
842
843    fn from_str(s: &str) -> Result<Self, Self::Err> {
844        match s {
845            "NONE" | "none" => Ok(EnabledStatistics::None),
846            "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk),
847            "PAGE" | "page" => Ok(EnabledStatistics::Page),
848            _ => Err(format!("Invalid statistics arg: {}", s)),
849        }
850    }
851}
852
853impl Default for EnabledStatistics {
854    fn default() -> Self {
855        DEFAULT_STATISTICS_ENABLED
856    }
857}
858
/// Controls the bloom filter to be computed by the writer.
#[derive(Clone, Debug, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability, which should always be between 0 and 1 exclusive.
    /// Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// A bloom filter is a trade-off between disk and memory space versus fpp: the smaller
    /// the fpp, the more memory and disk space is required. Setting it to a reasonable value,
    /// e.g. 0.1, 0.05, or 0.001, is therefore recommended.
    ///
    /// A very small fpp diminishes the value of the filter itself, as the bitset becomes
    /// even larger than just storing the whole value. You are also expected to set `ndv` if
    /// it can be known in advance, in order to largely reduce space usage.
    pub fpp: f64,
    /// Number of distinct values. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Bloom filters are most beneficial for columns with large cardinality, so a good
    /// heuristic is to set ndv to the number of rows. If you know in advance that the number
    /// of distinct values is smaller, using it can reduce disk size. For a very small ndv it
    /// is probably not worth using a bloom filter anyway.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in disk or
    /// memory size.
    pub ndv: u64,
}
886
887impl Default for BloomFilterProperties {
888    fn default() -> Self {
889        BloomFilterProperties {
890            fpp: DEFAULT_BLOOM_FILTER_FPP,
891            ndv: DEFAULT_BLOOM_FILTER_NDV,
892        }
893    }
894}
895
896/// Container for column properties that can be changed as part of writer.
897///
898/// If a field is `None`, it means that no specific value has been set for this column,
899/// so some subsequent or default value must be used.
900#[derive(Debug, Clone, Default, PartialEq)]
901struct ColumnProperties {
902    encoding: Option<Encoding>,
903    codec: Option<Compression>,
904    dictionary_enabled: Option<bool>,
905    statistics_enabled: Option<EnabledStatistics>,
906    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
907    max_statistics_size: Option<usize>,
908    /// bloom filter related properties
909    bloom_filter_properties: Option<BloomFilterProperties>,
910}
911
912impl ColumnProperties {
913    /// Sets encoding for this column.
914    ///
915    /// If dictionary is not enabled, this is treated as a primary encoding for a column.
916    /// In case when dictionary is enabled for a column, this value is considered to
917    /// be a fallback encoding.
918    ///
919    /// Panics if user tries to set dictionary encoding here, regardless of dictionary
920    /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
921    /// for a column.
922    fn set_encoding(&mut self, value: Encoding) {
923        if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
924            panic!("Dictionary encoding can not be used as fallback encoding");
925        }
926        self.encoding = Some(value);
927    }
928
929    /// Sets compression codec for this column.
930    fn set_compression(&mut self, value: Compression) {
931        self.codec = Some(value);
932    }
933
934    /// Sets whether or not dictionary encoding is enabled for this column.
935    fn set_dictionary_enabled(&mut self, enabled: bool) {
936        self.dictionary_enabled = Some(enabled);
937    }
938
939    /// Sets the statistics level for this column.
940    fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
941        self.statistics_enabled = Some(enabled);
942    }
943
944    /// Sets max size for statistics for this column.
945    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
946    #[allow(deprecated)]
947    fn set_max_statistics_size(&mut self, value: usize) {
948        self.max_statistics_size = Some(value);
949    }
950
951    /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
952    /// otherwise it is a no-op.
953    /// If `value` is `false`, resets bloom filter properties to `None`.
954    fn set_bloom_filter_enabled(&mut self, value: bool) {
955        if value && self.bloom_filter_properties.is_none() {
956            self.bloom_filter_properties = Some(Default::default())
957        } else if !value {
958            self.bloom_filter_properties = None
959        }
960    }
961
962    /// Sets the false positive probability for bloom filter for this column, and implicitly enables
963    /// bloom filter if not previously enabled.
964    ///
965    /// # Panics
966    ///
967    /// Panics if the `value` is not between 0 and 1 exclusive
968    fn set_bloom_filter_fpp(&mut self, value: f64) {
969        assert!(
970            value > 0. && value < 1.0,
971            "fpp must be between 0 and 1 exclusive, got {value}"
972        );
973
974        self.bloom_filter_properties
975            .get_or_insert_with(Default::default)
976            .fpp = value;
977    }
978
979    /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
980    /// enables bloom filter if not previously enabled.
981    fn set_bloom_filter_ndv(&mut self, value: u64) {
982        self.bloom_filter_properties
983            .get_or_insert_with(Default::default)
984            .ndv = value;
985    }
986
987    /// Returns optional encoding for this column.
988    fn encoding(&self) -> Option<Encoding> {
989        self.encoding
990    }
991
992    /// Returns optional compression codec for this column.
993    fn compression(&self) -> Option<Compression> {
994        self.codec
995    }
996
997    /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
998    /// disabled then returns `Some(false)`. If result is `None`, then no setting has
999    /// been provided.
1000    fn dictionary_enabled(&self) -> Option<bool> {
1001        self.dictionary_enabled
1002    }
1003
1004    /// Returns optional statistics level requested for this column. If result is `None`,
1005    /// then no setting has been provided.
1006    fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1007        self.statistics_enabled
1008    }
1009
1010    /// Returns optional max size in bytes for statistics.
1011    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
1012    fn max_statistics_size(&self) -> Option<usize> {
1013        #[allow(deprecated)]
1014        self.max_statistics_size
1015    }
1016
1017    /// Returns the bloom filter properties, or `None` if not enabled
1018    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1019        self.bloom_filter_properties.as_ref()
1020    }
1021}
1022
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

/// Whether bloom filters are read when the builder was never told otherwise;
/// [`ReaderPropertiesBuilder::build`] falls back to this value.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
1027
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    // Codec options produced by the builder's `CodecOptionsBuilder`.
    codec_options: CodecOptions,
    // Whether to read bloom filters; `false` unless set on the builder.
    read_bloom_filter: bool,
}
1050
1051impl ReaderProperties {
1052    /// Returns builder for reader properties with default values.
1053    pub fn builder() -> ReaderPropertiesBuilder {
1054        ReaderPropertiesBuilder::with_defaults()
1055    }
1056
1057    /// Returns codec options.
1058    pub(crate) fn codec_options(&self) -> &CodecOptions {
1059        &self.codec_options
1060    }
1061
1062    /// Returns whether to read bloom filter
1063    pub(crate) fn read_bloom_filter(&self) -> bool {
1064        self.read_bloom_filter
1065    }
1066}
1067
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    // Accumulates codec settings until `build` turns them into `CodecOptions`.
    codec_options_builder: CodecOptionsBuilder,
    // Explicit bloom filter choice; `None` makes `build` fall back to
    // `DEFAULT_READ_BLOOM_FILTER`.
    read_bloom_filter: Option<bool>,
}
1074
1075/// Reader properties builder.
1076impl ReaderPropertiesBuilder {
1077    /// Returns default state of the builder.
1078    fn with_defaults() -> Self {
1079        Self {
1080            codec_options_builder: CodecOptionsBuilder::default(),
1081            read_bloom_filter: None,
1082        }
1083    }
1084
1085    /// Finalizes the configuration and returns immutable reader properties struct.
1086    pub fn build(self) -> ReaderProperties {
1087        ReaderProperties {
1088            codec_options: self.codec_options_builder.build(),
1089            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1090        }
1091    }
1092
1093    /// Enable/disable backward compatible LZ4.
1094    ///
1095    /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1096    /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1097    /// with files generated by older versions of this library, and LZ4_RAW, for backward
1098    /// compatibility with files generated by older versions of parquet-cpp.
1099    ///
1100    /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1101    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1102        self.codec_options_builder = self
1103            .codec_options_builder
1104            .set_backward_compatible_lz4(value);
1105        self
1106    }
1107
1108    /// Enable/disable reading bloom filter
1109    ///
1110    /// If reading bloom filter is enabled, bloom filter will be read from the file.
1111    /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1112    ///
1113    /// By default bloom filter is set to be read.
1114    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1115        self.read_bloom_filter = Some(value);
1116        self
1117    }
1118}
1119
#[cfg(test)]
mod tests {
    use super::*;

    // Column path that the column-specific assertions below refer to.
    fn test_col() -> ColumnPath {
        ColumnPath::from("col")
    }

    #[test]
    fn test_writer_version() {
        let expectations = [
            (WriterVersion::PARQUET_1_0, 1),
            (WriterVersion::PARQUET_2_0, 2),
        ];
        for (version, number) in expectations {
            assert_eq!(version.as_num(), number);
        }
    }

    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        let col = test_col();

        // file-level settings
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);

        // column-level settings fall back to the defaults
        assert_eq!(props.encoding(&col), None);
        assert_eq!(props.compression(&col), DEFAULT_COMPRESSION);
        assert_eq!(props.dictionary_enabled(&col), DEFAULT_DICTIONARY_ENABLED);
        assert_eq!(props.statistics_enabled(&col), DEFAULT_STATISTICS_ENABLED);
        assert!(props.bloom_filter_properties(&col).is_none());
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in [WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let builder = WriterProperties::builder().set_writer_version(version);
            let props = builder.build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        let builder = WriterProperties::builder();
        builder.set_encoding(Encoding::PLAIN_DICTIONARY).build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        let builder = WriterProperties::builder();
        builder.set_encoding(Encoding::RLE_DICTIONARY).build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        let builder = WriterProperties::builder().set_dictionary_enabled(true);
        builder
            .set_column_encoding(test_col(), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        let builder = WriterProperties::builder().set_dictionary_enabled(false);
        builder
            .set_column_encoding(test_col(), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    fn test_writer_properties_builder() {
        let metadata = || vec![KeyValue::new("key".to_string(), "value".to_string())];

        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_size(40)
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(metadata()))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(test_col(), Encoding::RLE)
            .set_column_compression(test_col(), Compression::SNAPPY)
            .set_column_dictionary_enabled(test_col(), true)
            .set_column_statistics_enabled(test_col(), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(test_col(), true)
            .set_column_bloom_filter_ndv(test_col(), 100_u64)
            .set_column_bloom_filter_fpp(test_col(), 0.1)
            .build();

        // file settings
        assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
        assert_eq!(props.data_page_size_limit(), 10);
        assert_eq!(props.dictionary_page_size_limit(), 20);
        assert_eq!(props.write_batch_size(), 30);
        assert_eq!(props.max_row_group_size(), 40);
        assert_eq!(props.created_by(), "default");
        assert_eq!(props.key_value_metadata(), Some(&metadata()));

        // a column without overrides sees the global column settings
        let other = ColumnPath::from("a");
        assert_eq!(
            props.encoding(&other),
            Some(Encoding::DELTA_BINARY_PACKED)
        );
        assert_eq!(
            props.compression(&other),
            Compression::GZIP(Default::default())
        );
        assert!(!props.dictionary_enabled(&other));
        assert_eq!(props.statistics_enabled(&other), EnabledStatistics::None);

        // the overridden column sees its specific settings
        let col = test_col();
        assert_eq!(props.encoding(&col), Some(Encoding::RLE));
        assert_eq!(props.compression(&col), Compression::SNAPPY);
        assert!(props.dictionary_enabled(&col));
        assert_eq!(props.statistics_enabled(&col), EnabledStatistics::Chunk);
        assert_eq!(
            props.bloom_filter_properties(&col),
            Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
        );
    }

    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(test_col(), Encoding::RLE)
            .build();

        let col = test_col();
        assert_eq!(props.encoding(&col), Some(Encoding::RLE));
        assert_eq!(
            props.compression(&col),
            Compression::GZIP(Default::default())
        );
        assert_eq!(props.dictionary_enabled(&col), DEFAULT_DICTIONARY_ENABLED);
        assert_eq!(
            props.bloom_filter_properties(&col),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        let col = test_col();

        // nothing set: no bloom filter
        let untouched = WriterProperties::builder().build();
        assert_eq!(untouched.bloom_filter_properties(&col), None);

        // only ndv set: fpp keeps its default
        let with_ndv = WriterProperties::builder()
            .set_bloom_filter_ndv(100)
            .build();
        assert_eq!(
            with_ndv.bloom_filter_properties(&col),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 100
            })
        );

        // only fpp set: ndv keeps its default
        let with_fpp = WriterProperties::builder()
            .set_bloom_filter_fpp(0.1)
            .build();
        assert_eq!(
            with_fpp.bloom_filter_properties(&col),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_reader_properties_default_settings() {
        let expected_codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        let props = ReaderProperties::builder().build();

        assert_eq!(props.codec_options(), &expected_codec_options);
        assert!(!props.read_bloom_filter());
    }

    #[test]
    fn test_reader_properties_builder() {
        let expected_codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &expected_codec_options);
    }

    #[test]
    fn test_parse_writerversion() {
        // uppercase names, followed by one lowercase spelling
        let accepted = [
            ("PARQUET_1_0", WriterVersion::PARQUET_1_0),
            ("PARQUET_2_0", WriterVersion::PARQUET_2_0),
            ("parquet_1_0", WriterVersion::PARQUET_1_0),
        ];
        for (input, expected) in accepted {
            let writer_version = input.parse::<WriterVersion>().unwrap();
            assert_eq!(writer_version, expected);
        }

        // test invalid version
        let outcome = "PARQUET_-1_0".parse::<WriterVersion>();
        match outcome {
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        // uppercase names, followed by one lowercase spelling
        let accepted = [
            ("NONE", EnabledStatistics::None),
            ("CHUNK", EnabledStatistics::Chunk),
            ("PAGE", EnabledStatistics::Page),
            ("none", EnabledStatistics::None),
        ];
        for (input, expected) in accepted {
            let enabled_statistics = input.parse::<EnabledStatistics>().unwrap();
            assert_eq!(enabled_statistics, expected);
        }

        // test invalid statistics
        let outcome = "ChunkAndPage".parse::<EnabledStatistics>();
        match outcome {
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
        }
    }
}