parquet/file/
properties.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::KeyValue;
24use crate::format::SortingColumn;
25use crate::schema::types::ColumnPath;
26use std::str::FromStr;
27use std::{collections::HashMap, sync::Arc};
28
/// Default value for [`WriterProperties::data_page_size_limit`]
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
///
/// Defined in terms of [`DEFAULT_PAGE_SIZE`], so both limits share one value.
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_size`]
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
///
/// The crate version is taken from `CARGO_PKG_VERSION` at compile time.
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
65
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns the numeric form of the writer version: `1` for
    /// [`WriterVersion::PARQUET_1_0`] and `2` for [`WriterVersion::PARQUET_2_0`].
    pub fn as_num(&self) -> i32 {
        match *self {
            Self::PARQUET_1_0 => 1,
            Self::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a writer version from its name.
    ///
    /// Only the exact all-uppercase (`PARQUET_1_0`) and all-lowercase
    /// (`parquet_1_0`) spellings are accepted; any other input, including
    /// mixed case, yields an `Err` carrying a message that echoes the input.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // (uppercase spelling, lowercase spelling, parsed version)
        const SPELLINGS: [(&str, &str, WriterVersion); 2] = [
            ("PARQUET_1_0", "parquet_1_0", WriterVersion::PARQUET_1_0),
            ("PARQUET_2_0", "parquet_2_0", WriterVersion::PARQUET_2_0),
        ];

        SPELLINGS
            .iter()
            .find(|(upper, lower, _)| s == *upper || s == *lower)
            .map(|&(_, _, version)| version)
            .ok_or_else(|| format!("Invalid writer version: {s}"))
    }
}
99
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The default is [`DEFAULT_BLOOM_FILTER_POSITION`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
117
/// Reference-counted ([`Arc`]) handle to [`WriterProperties`].
pub type WriterPropertiesPtr = Arc<WriterProperties>;
120
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    /// Best-effort maximum size of a data page, see [`Self::data_page_size_limit`]
    data_page_size_limit: usize,
    /// Best-effort maximum number of rows in a data page
    data_page_row_count_limit: usize,
    /// Batch size used when writing column data
    write_batch_size: usize,
    /// Maximum number of rows in a row group
    max_row_group_size: usize,
    /// Where Bloom filters are written in the file
    bloom_filter_position: BloomFilterPosition,
    /// Writer version recorded for the file
    writer_version: WriterVersion,
    /// `created_by` string
    created_by: String,
    /// Raw "offset index disabled" flag; [`Self::offset_index_disabled`] may
    /// override it when page-level statistics are configured
    offset_index_disabled: bool,
    /// Optional key/value metadata pairs
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings consulted when a column has no override of its own
    default_column_properties: ColumnProperties,
    /// Per-column overrides, keyed by column path
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Optional sorting columns
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Maximum length of truncated min/max values in the column index (`None` disables truncation)
    column_index_truncate_length: Option<usize>,
    /// Maximum length of truncated min/max values in statistics (`None` disables truncation)
    statistics_truncate_length: Option<usize>,
    /// Whether type coercion is enabled
    coerce_types: bool,
    /// Optional file encryption settings (only with the `encryption` feature)
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<FileEncryptionProperties>,
}
175
176impl Default for WriterProperties {
177    fn default() -> Self {
178        Self::builder().build()
179    }
180}
181
182impl WriterProperties {
183    /// Create a new [`WriterProperties`] with the default settings
184    ///
185    /// See [`WriterProperties::builder`] for customising settings
186    pub fn new() -> Self {
187        Self::default()
188    }
189
190    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
191    /// properties.
192    pub fn builder() -> WriterPropertiesBuilder {
193        WriterPropertiesBuilder::default()
194    }
195
196    /// Converts this [`WriterProperties`] into a [`WriterPropertiesBuilder`]
197    /// Used for mutating existing property settings
198    pub fn into_builder(self) -> WriterPropertiesBuilder {
199        self.into()
200    }
201
202    /// Returns data page size limit.
203    ///
204    /// Note: this is a best effort limit based on the write batch size
205    ///
206    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
207    pub fn data_page_size_limit(&self) -> usize {
208        self.data_page_size_limit
209    }
210
211    /// Returns dictionary page size limit.
212    ///
213    /// Note: this is a best effort limit based on the write batch size
214    ///
215    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
216    pub fn dictionary_page_size_limit(&self) -> usize {
217        self.default_column_properties
218            .dictionary_page_size_limit()
219            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
220    }
221
222    /// Returns dictionary page size limit for a specific column.
223    pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
224        self.column_properties
225            .get(col)
226            .and_then(|c| c.dictionary_page_size_limit())
227            .or_else(|| self.default_column_properties.dictionary_page_size_limit())
228            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
229    }
230
231    /// Returns the maximum page row count
232    ///
233    /// Note: this is a best effort limit based on the write batch size
234    ///
235    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
236    pub fn data_page_row_count_limit(&self) -> usize {
237        self.data_page_row_count_limit
238    }
239
240    /// Returns configured batch size for writes.
241    ///
242    /// When writing a batch of data, this setting allows to split it internally into
243    /// smaller batches so we can better estimate the size of a page currently being
244    /// written.
245    ///
246    /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
247    pub fn write_batch_size(&self) -> usize {
248        self.write_batch_size
249    }
250
251    /// Returns maximum number of rows in a row group.
252    ///
253    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
254    pub fn max_row_group_size(&self) -> usize {
255        self.max_row_group_size
256    }
257
258    /// Returns bloom filter position.
259    ///
260    /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
261    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
262        self.bloom_filter_position
263    }
264
265    /// Returns configured writer version.
266    ///
267    /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
268    pub fn writer_version(&self) -> WriterVersion {
269        self.writer_version
270    }
271
272    /// Returns `created_by` string.
273    ///
274    /// For more details see [`WriterPropertiesBuilder::set_created_by`]
275    pub fn created_by(&self) -> &str {
276        &self.created_by
277    }
278
279    /// Returns `true` if offset index writing is disabled.
280    ///
281    /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
282    pub fn offset_index_disabled(&self) -> bool {
283        // If page statistics are to be collected, then do not disable the offset indexes.
284        let default_page_stats_enabled =
285            self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
286        let column_page_stats_enabled = self
287            .column_properties
288            .iter()
289            .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
290        if default_page_stats_enabled || column_page_stats_enabled {
291            return false;
292        }
293
294        self.offset_index_disabled
295    }
296
297    /// Returns `key_value_metadata` KeyValue pairs.
298    ///
299    /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
300    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
301        self.key_value_metadata.as_ref()
302    }
303
304    /// Returns sorting columns.
305    ///
306    /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
307    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
308        self.sorting_columns.as_ref()
309    }
310
311    /// Returns the maximum length of truncated min/max values in the column index.
312    ///
313    /// `None` if truncation is disabled, must be greater than 0 otherwise.
314    ///
315    /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
316    pub fn column_index_truncate_length(&self) -> Option<usize> {
317        self.column_index_truncate_length
318    }
319
320    /// Returns the maximum length of truncated min/max values in [`Statistics`].
321    ///
322    /// `None` if truncation is disabled, must be greater than 0 otherwise.
323    ///
324    /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
325    ///
326    /// [`Statistics`]: crate::file::statistics::Statistics
327    pub fn statistics_truncate_length(&self) -> Option<usize> {
328        self.statistics_truncate_length
329    }
330
331    /// Returns `true` if type coercion is enabled.
332    ///
333    /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
334    pub fn coerce_types(&self) -> bool {
335        self.coerce_types
336    }
337
338    /// Returns encoding for a data page, when dictionary encoding is enabled.
339    ///
340    /// This is not configurable.
341    #[inline]
342    pub fn dictionary_data_page_encoding(&self) -> Encoding {
343        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
344        // Dictionary values are encoded using RLE_DICTIONARY encoding.
345        Encoding::RLE_DICTIONARY
346    }
347
348    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
349    ///
350    /// This is not configurable.
351    #[inline]
352    pub fn dictionary_page_encoding(&self) -> Encoding {
353        // PLAIN_DICTIONARY is deprecated in writer version 1.
354        // Dictionary is encoded using plain encoding.
355        Encoding::PLAIN
356    }
357
358    /// Returns encoding for a column, if set.
359    ///
360    /// In case when dictionary is enabled, returns fallback encoding.
361    ///
362    /// If encoding is not set, then column writer will choose the best encoding
363    /// based on the column type.
364    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
365        self.column_properties
366            .get(col)
367            .and_then(|c| c.encoding())
368            .or_else(|| self.default_column_properties.encoding())
369    }
370
371    /// Returns compression codec for a column.
372    ///
373    /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
374    pub fn compression(&self, col: &ColumnPath) -> Compression {
375        self.column_properties
376            .get(col)
377            .and_then(|c| c.compression())
378            .or_else(|| self.default_column_properties.compression())
379            .unwrap_or(DEFAULT_COMPRESSION)
380    }
381
382    /// Returns `true` if dictionary encoding is enabled for a column.
383    ///
384    /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
385    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
386        self.column_properties
387            .get(col)
388            .and_then(|c| c.dictionary_enabled())
389            .or_else(|| self.default_column_properties.dictionary_enabled())
390            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
391    }
392
393    /// Returns which statistics are written for a column.
394    ///
395    /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
396    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
397        self.column_properties
398            .get(col)
399            .and_then(|c| c.statistics_enabled())
400            .or_else(|| self.default_column_properties.statistics_enabled())
401            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
402    }
403
404    /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
405    ///
406    /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
407    ///
408    /// [`Statistics`]: crate::file::statistics::Statistics
409    pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
410        self.column_properties
411            .get(col)
412            .and_then(|c| c.write_page_header_statistics())
413            .or_else(|| {
414                self.default_column_properties
415                    .write_page_header_statistics()
416            })
417            .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
418    }
419
420    /// Returns the [`BloomFilterProperties`] for the given column
421    ///
422    /// Returns `None` if bloom filter is disabled
423    ///
424    /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
425    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
426        self.column_properties
427            .get(col)
428            .and_then(|c| c.bloom_filter_properties())
429            .or_else(|| self.default_column_properties.bloom_filter_properties())
430    }
431
432    /// Return file encryption properties
433    ///
434    /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
435    #[cfg(feature = "encryption")]
436    pub fn file_encryption_properties(&self) -> Option<&FileEncryptionProperties> {
437        self.file_encryption_properties.as_ref()
438    }
439}
440
/// Builder for [`WriterProperties`] Parquet writer configuration.
///
/// See example on [`WriterProperties`]
#[derive(Debug, Clone)]
pub struct WriterPropertiesBuilder {
    /// Best-effort maximum size of a data page
    data_page_size_limit: usize,
    /// Best-effort maximum number of rows in a data page
    data_page_row_count_limit: usize,
    /// Batch size used when writing column data
    write_batch_size: usize,
    /// Maximum number of rows in a row group
    max_row_group_size: usize,
    /// Where Bloom filters are written in the file
    bloom_filter_position: BloomFilterPosition,
    /// Writer version recorded for the file
    writer_version: WriterVersion,
    /// `created_by` string
    created_by: String,
    /// Whether writing of offset indexes is disabled
    offset_index_disabled: bool,
    /// Optional key/value metadata pairs
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings that apply to all columns
    default_column_properties: ColumnProperties,
    /// Per-column settings, keyed by column path
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Optional sorting columns
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Maximum length of min/max values in the column index (`None` means no limit)
    column_index_truncate_length: Option<usize>,
    /// Maximum length of min/max values in statistics (`None` means no limit)
    statistics_truncate_length: Option<usize>,
    /// Whether types are coerced to parquet native types
    coerce_types: bool,
    /// Optional file encryption settings (only with the `encryption` feature)
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<FileEncryptionProperties>,
}
464
465impl Default for WriterPropertiesBuilder {
466    /// Returns default state of the builder.
467    fn default() -> Self {
468        Self {
469            data_page_size_limit: DEFAULT_PAGE_SIZE,
470            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
471            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
472            max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
473            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
474            writer_version: DEFAULT_WRITER_VERSION,
475            created_by: DEFAULT_CREATED_BY.to_string(),
476            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
477            key_value_metadata: None,
478            default_column_properties: Default::default(),
479            column_properties: HashMap::new(),
480            sorting_columns: None,
481            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
482            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
483            coerce_types: DEFAULT_COERCE_TYPES,
484            #[cfg(feature = "encryption")]
485            file_encryption_properties: None,
486        }
487    }
488}
489
490impl WriterPropertiesBuilder {
491    /// Finalizes the configuration and returns immutable writer properties struct.
492    pub fn build(self) -> WriterProperties {
493        WriterProperties {
494            data_page_size_limit: self.data_page_size_limit,
495            data_page_row_count_limit: self.data_page_row_count_limit,
496            write_batch_size: self.write_batch_size,
497            max_row_group_size: self.max_row_group_size,
498            bloom_filter_position: self.bloom_filter_position,
499            writer_version: self.writer_version,
500            created_by: self.created_by,
501            offset_index_disabled: self.offset_index_disabled,
502            key_value_metadata: self.key_value_metadata,
503            default_column_properties: self.default_column_properties,
504            column_properties: self.column_properties,
505            sorting_columns: self.sorting_columns,
506            column_index_truncate_length: self.column_index_truncate_length,
507            statistics_truncate_length: self.statistics_truncate_length,
508            coerce_types: self.coerce_types,
509            #[cfg(feature = "encryption")]
510            file_encryption_properties: self.file_encryption_properties,
511        }
512    }
513
514    // ----------------------------------------------------------------------
515    // Writer properties related to a file
516
517    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
518    /// via [`DEFAULT_WRITER_VERSION`])
519    ///
520    /// This value can determine what features some readers will support.
521    ///
522    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
523    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
524        self.writer_version = value;
525        self
526    }
527
528    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
529    /// via [`DEFAULT_PAGE_SIZE`]).
530    ///
531    /// The parquet writer will attempt to limit the sizes of each
532    /// `DataPage` to this many bytes. Reducing this value will result
533    /// in larger parquet files, but may improve the effectiveness of
534    /// page index based predicate pushdown during reading.
535    ///
536    /// Note: this is a best effort limit based on value of
537    /// [`set_write_batch_size`](Self::set_write_batch_size).
538    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
539        self.data_page_size_limit = value;
540        self
541    }
542
543    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
544    /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
545    ///
546    /// The parquet writer will attempt to limit the number of rows in
547    /// each `DataPage` to this value. Reducing this value will result
548    /// in larger parquet files, but may improve the effectiveness of
549    /// page index based predicate pushdown during reading.
550    ///
551    /// Note: this is a best effort limit based on value of
552    /// [`set_write_batch_size`](Self::set_write_batch_size).
553    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
554        self.data_page_row_count_limit = value;
555        self
556    }
557
558    /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
559    ///
560    /// For performance reasons, data for each column is written in
561    /// batches of this size.
562    ///
563    /// Additional limits such as such as
564    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
565    /// are checked between batches, and thus the write batch size value acts as an
566    /// upper-bound on the enforcement granularity of other limits.
567    pub fn set_write_batch_size(mut self, value: usize) -> Self {
568        self.write_batch_size = value;
569        self
570    }
571
572    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
573    /// via [`DEFAULT_MAX_ROW_GROUP_SIZE`]).
574    ///
575    /// # Panics
576    /// If the value is set to 0.
577    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
578        assert!(value > 0, "Cannot have a 0 max row group size");
579        self.max_row_group_size = value;
580        self
581    }
582
583    /// Sets where in the final file Bloom Filters are written (defaults to  [`AfterRowGroup`]
584    /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
585    ///
586    /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
587    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
588        self.bloom_filter_position = value;
589        self
590    }
591
592    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
593    /// [`DEFAULT_CREATED_BY`]).
594    ///
595    /// This is a string that will be written into the file metadata
596    pub fn set_created_by(mut self, value: String) -> Self {
597        self.created_by = value;
598        self
599    }
600
601    /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
602    /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
603    ///
604    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
605    ///
606    /// Note: As the offset indexes are useful for accessing data by row number,
607    /// they are always written by default, regardless of whether other statistics
608    /// are enabled. Disabling this metadata may result in a degradation in read
609    /// performance, so use this option with care.
610    ///
611    /// [`Page`]: EnabledStatistics::Page
612    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
613        self.offset_index_disabled = value;
614        self
615    }
616
617    /// Sets "key_value_metadata" property (defaults to `None`).
618    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
619        self.key_value_metadata = value;
620        self
621    }
622
623    /// Sets sorting order of rows in the row group if any (defaults to `None`).
624    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
625        self.sorting_columns = value;
626        self
627    }
628
629    /// Sets the max length of min/max value fields when writing the column
630    /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
631    ///
632    /// This can be used to prevent columns with very long values (hundreds of
633    /// bytes long) from causing the parquet metadata to become huge.
634    ///
635    /// # Notes
636    ///
637    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
638    /// set to [`EnabledStatistics::Page`].
639    ///
640    /// * If `Some`, must be greater than 0, otherwise will panic
641    /// * If `None`, there's no effective limit.
642    ///
643    /// [`Index`]: crate::file::page_index::index::Index
644    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
645        if let Some(value) = max_length {
646            assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`.");
647        }
648
649        self.column_index_truncate_length = max_length;
650        self
651    }
652
653    /// Sets the max length of min/max value fields in row group and data page header
654    /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
655    ///
656    /// # Notes
657    /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
658    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
659    /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
660    /// [`EnabledStatistics::Page`].
661    ///
662    /// * If `Some`, must be greater than 0, otherwise will panic
663    /// * If `None`, there's no effective limit.
664    ///
665    /// # See also
666    /// Truncation of Page Index statistics is controlled separately via
667    /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
668    ///
669    /// [`Statistics`]: crate::file::statistics::Statistics
670    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
671        if let Some(value) = max_length {
672            assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
673        }
674
675        self.statistics_truncate_length = max_length;
676        self
677    }
678
679    /// Should the writer coerce types to parquet native types (defaults to `false` via
680    /// [`DEFAULT_COERCE_TYPES`]).
681    ///
682    /// Leaving this option the default `false` will ensure the exact same data
683    /// written to parquet using this library will be read.
684    ///
685    /// Setting this option to `true` will result in parquet files that can be
686    /// read by more readers, but potentially lose information in the process.
687    ///
688    /// * Types such as [`DataType::Date64`], which have no direct corresponding
689    ///   Parquet type, may be stored with lower precision.
690    ///
691    /// * The internal field names of `List` and `Map` types will be renamed if
692    ///   necessary to match what is required by the newest Parquet specification.
693    ///
694    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
695    ///
696    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
697    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
698    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
699        self.coerce_types = coerce_types;
700        self
701    }
702
703    /// Sets FileEncryptionProperties (defaults to `None`)
704    #[cfg(feature = "encryption")]
705    pub fn with_file_encryption_properties(
706        mut self,
707        file_encryption_properties: FileEncryptionProperties,
708    ) -> Self {
709        self.file_encryption_properties = Some(file_encryption_properties);
710        self
711    }
712
713    // ----------------------------------------------------------------------
714    // Setters for any column (global)
715
716    /// Sets default encoding for all columns.
717    ///
718    /// If dictionary is not enabled, this is treated as a primary encoding for all
719    /// columns. In case when dictionary is enabled for any column, this value is
720    /// considered to be a fallback encoding for that column.
721    ///
722    /// # Panics
723    ///
724    /// if dictionary encoding is specified, regardless of dictionary
725    /// encoding flag being set.
726    pub fn set_encoding(mut self, value: Encoding) -> Self {
727        self.default_column_properties.set_encoding(value);
728        self
729    }
730
731    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
732    /// [`DEFAULT_COMPRESSION`]).
733    ///
734    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
735    pub fn set_compression(mut self, value: Compression) -> Self {
736        self.default_column_properties.set_compression(value);
737        self
738    }
739
740    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
741    /// via [`DEFAULT_DICTIONARY_ENABLED`]).
742    ///
743    /// Use this method to set dictionary encoding, instead of explicitly specifying
744    /// encoding in `set_encoding` method.
745    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
746        self.default_column_properties.set_dictionary_enabled(value);
747        self
748    }
749
750    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
751    /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
752    ///
753    /// The parquet writer will attempt to limit the size of each
754    /// `DataPage` used to store dictionaries to this many
755    /// bytes. Reducing this value will result in larger parquet
756    /// files, but may improve the effectiveness of page index based
757    /// predicate pushdown during reading.
758    ///
759    /// Note: this is a best effort limit based on value of
760    /// [`set_write_batch_size`](Self::set_write_batch_size).
761    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
762        self.default_column_properties
763            .set_dictionary_page_size_limit(value);
764        self
765    }
766
767    /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
768    /// [`DEFAULT_STATISTICS_ENABLED`]).
769    ///
770    /// [`Page`]: EnabledStatistics::Page
771    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
772        self.default_column_properties.set_statistics_enabled(value);
773        self
774    }
775
776    /// enable/disable writing [`Statistics`] in the page header
777    /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
778    ///
779    /// Only applicable if [`Page`] level statistics are gathered.
780    ///
781    /// Setting this value to `true` can greatly increase the size of the resulting Parquet
782    /// file while yielding very little added benefit. Most modern Parquet implementations
783    /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
784    /// those in the page header.
785    ///
786    /// # Note
787    ///
788    /// Prior to version 56.0.0, the `parquet` crate always wrote these
789    /// statistics (the equivalent of setting this option to `true`). This was
790    /// changed in 56.0.0 to follow the recommendation in the Parquet
791    /// specification. See [issue #7580] for more details.
792    ///
793    /// [`Statistics`]: crate::file::statistics::Statistics
794    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
795    /// [`Page`]: EnabledStatistics::Page
796    /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
797    pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
798        self.default_column_properties
799            .set_write_page_header_statistics(value);
800        self
801    }
802
803    /// Sets if bloom filter should be written for all columns (defaults to `false`).
804    ///
805    /// # Notes
806    ///
807    /// * If the bloom filter is enabled previously then it is a no-op.
808    ///
809    /// * If the bloom filter is not enabled, default values for ndv and fpp
810    ///   value are used used. See [`set_bloom_filter_ndv`] and
811    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
812    ///
813    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
814    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
815    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
816        self.default_column_properties
817            .set_bloom_filter_enabled(value);
818        self
819    }
820
821    /// Sets the default target bloom filter false positive probability (fpp)
822    /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
823    ///
824    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
825    /// been called.
826    ///
827    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
828    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
829        self.default_column_properties.set_bloom_filter_fpp(value);
830        self
831    }
832
833    /// Sets default number of distinct values (ndv) for bloom filter for all
834    /// columns (defaults to `1_000_000` via [`DEFAULT_BLOOM_FILTER_NDV`]).
835    ///
836    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
837    /// been called.
838    ///
839    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
840    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
841        self.default_column_properties.set_bloom_filter_ndv(value);
842        self
843    }
844
845    // ----------------------------------------------------------------------
846    // Setters for a specific column
847
848    /// Helper method to get existing or new mutable reference of column properties.
849    #[inline]
850    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
851        self.column_properties.entry(col).or_default()
852    }
853
854    /// Sets encoding for a specific column.
855    ///
856    /// Takes precedence over [`Self::set_encoding`].
857    ///
858    /// If dictionary is not enabled, this is treated as a primary encoding for this
859    /// column. In case when dictionary is enabled for this column, either through
860    /// global defaults or explicitly, this value is considered to be a fallback
861    /// encoding for this column.
862    ///
863    /// # Panics
864    /// If user tries to set dictionary encoding here, regardless of dictionary
865    /// encoding flag being set.
866    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
867        self.get_mut_props(col).set_encoding(value);
868        self
869    }
870
871    /// Sets compression codec for a specific column.
872    ///
873    /// Takes precedence over [`Self::set_compression`].
874    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
875        self.get_mut_props(col).set_compression(value);
876        self
877    }
878
879    /// Sets flag to enable/disable dictionary encoding for a specific column.
880    ///
881    /// Takes precedence over [`Self::set_dictionary_enabled`].
882    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
883        self.get_mut_props(col).set_dictionary_enabled(value);
884        self
885    }
886
887    /// Sets dictionary page size limit for a specific column.
888    ///
889    /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
890    pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
891        self.get_mut_props(col)
892            .set_dictionary_page_size_limit(value);
893        self
894    }
895
896    /// Sets [`EnabledStatistics`] level for a specific column.
897    ///
898    /// Takes precedence over [`Self::set_statistics_enabled`].
899    pub fn set_column_statistics_enabled(
900        mut self,
901        col: ColumnPath,
902        value: EnabledStatistics,
903    ) -> Self {
904        self.get_mut_props(col).set_statistics_enabled(value);
905        self
906    }
907
908    /// Sets whether to write [`Statistics`] in the page header for a specific column.
909    ///
910    /// Takes precedence over [`Self::set_write_page_header_statistics`].
911    ///
912    /// [`Statistics`]: crate::file::statistics::Statistics
913    pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
914        self.get_mut_props(col)
915            .set_write_page_header_statistics(value);
916        self
917    }
918
919    /// Sets whether a bloom filter should be written for a specific column.
920    ///
921    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
922    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
923        self.get_mut_props(col).set_bloom_filter_enabled(value);
924        self
925    }
926
927    /// Sets the false positive probability for bloom filter for a specific column.
928    ///
929    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
930    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
931        self.get_mut_props(col).set_bloom_filter_fpp(value);
932        self
933    }
934
935    /// Sets the number of distinct values for bloom filter for a specific column.
936    ///
937    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
938    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
939        self.get_mut_props(col).set_bloom_filter_ndv(value);
940        self
941    }
942}
943
944impl From<WriterProperties> for WriterPropertiesBuilder {
945    fn from(props: WriterProperties) -> Self {
946        WriterPropertiesBuilder {
947            data_page_size_limit: props.data_page_size_limit,
948            data_page_row_count_limit: props.data_page_row_count_limit,
949            write_batch_size: props.write_batch_size,
950            max_row_group_size: props.max_row_group_size,
951            bloom_filter_position: props.bloom_filter_position,
952            writer_version: props.writer_version,
953            created_by: props.created_by,
954            offset_index_disabled: props.offset_index_disabled,
955            key_value_metadata: props.key_value_metadata,
956            default_column_properties: props.default_column_properties,
957            column_properties: props.column_properties,
958            sorting_columns: props.sorting_columns,
959            column_index_truncate_length: props.column_index_truncate_length,
960            statistics_truncate_length: props.statistics_truncate_length,
961            coerce_types: props.coerce_types,
962            #[cfg(feature = "encryption")]
963            file_encryption_properties: props.file_encryption_properties,
964        }
965    }
966}
967
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. In addition, this will enable the writing
    /// of the column index (the offset index is always written regardless of
    /// this setting). See [`ParquetColumnIndex`] for
    /// more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}

/// Parses a statistics level from its name.
///
/// The names `none`, `chunk` and `page` are matched without regard to ASCII
/// case, so the variant spellings printed by `Debug` (`None`, `Chunk`, `Page`)
/// parse as well as the all-upper and all-lower forms.
///
/// # Errors
///
/// Returns `Invalid statistics arg: <input>` for any other string.
impl FromStr for EnabledStatistics {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Lower-case spelling of every level, paired with the level it selects.
        const LEVELS: [(&str, EnabledStatistics); 3] = [
            ("none", EnabledStatistics::None),
            ("chunk", EnabledStatistics::Chunk),
            ("page", EnabledStatistics::Page),
        ];

        LEVELS
            .iter()
            .find(|(name, _)| s.eq_ignore_ascii_case(name))
            .map(|&(_, level)| level)
            .ok_or_else(|| format!("Invalid statistics arg: {s}"))
    }
}
1011
1012impl Default for EnabledStatistics {
1013    fn default() -> Self {
1014        DEFAULT_STATISTICS_ENABLED
1015    }
1016}
1017
1018/// Controls the bloom filter to be computed by the writer.
1019#[derive(Debug, Clone, PartialEq)]
1020pub struct BloomFilterProperties {
1021    /// False positive probability. This should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
1022    ///
1023    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
1024    ///
1025    /// The bloom filter data structure is a trade of between disk and memory space versus fpp, the
1026    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
1027    /// e.g. 0.1, 0.05, or 0.001 is recommended.
1028    ///
1029    /// Setting to a very small number diminishes the value of the filter itself, as the bitset size is
1030    /// even larger than just storing the whole value. You are also expected to set `ndv` if it can
1031    /// be known in advance to greatly reduce space usage.
1032    pub fpp: f64,
1033    /// Number of distinct values, should be non-negative to be meaningful. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
1034    ///
1035    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
1036    ///
1037    /// Usage of bloom filter is most beneficial for columns with large cardinality, so a good heuristic
1038    /// is to set ndv to the number of rows. However, it can reduce disk size if you know in advance a smaller
1039    /// number of distinct values. For very small ndv value it is probably not worth it to use bloom filter
1040    /// anyway.
1041    ///
1042    /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
1043    pub ndv: u64,
1044}
1045
1046impl Default for BloomFilterProperties {
1047    fn default() -> Self {
1048        BloomFilterProperties {
1049            fpp: DEFAULT_BLOOM_FILTER_FPP,
1050            ndv: DEFAULT_BLOOM_FILTER_NDV,
1051        }
1052    }
1053}
1054
1055/// Container for column properties that can be changed as part of writer.
1056///
1057/// If a field is `None`, it means that no specific value has been set for this column,
1058/// so some subsequent or default value must be used.
1059#[derive(Debug, Clone, Default, PartialEq)]
1060struct ColumnProperties {
1061    encoding: Option<Encoding>,
1062    codec: Option<Compression>,
1063    dictionary_page_size_limit: Option<usize>,
1064    dictionary_enabled: Option<bool>,
1065    statistics_enabled: Option<EnabledStatistics>,
1066    write_page_header_statistics: Option<bool>,
1067    /// bloom filter related properties
1068    bloom_filter_properties: Option<BloomFilterProperties>,
1069}
1070
1071impl ColumnProperties {
1072    /// Sets encoding for this column.
1073    ///
1074    /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1075    /// In case when dictionary is enabled for a column, this value is considered to
1076    /// be a fallback encoding.
1077    ///
1078    /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1079    /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1080    /// for a column.
1081    fn set_encoding(&mut self, value: Encoding) {
1082        if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1083            panic!("Dictionary encoding can not be used as fallback encoding");
1084        }
1085        self.encoding = Some(value);
1086    }
1087
1088    /// Sets compression codec for this column.
1089    fn set_compression(&mut self, value: Compression) {
1090        self.codec = Some(value);
1091    }
1092
1093    /// Sets whether dictionary encoding is enabled for this column.
1094    fn set_dictionary_enabled(&mut self, enabled: bool) {
1095        self.dictionary_enabled = Some(enabled);
1096    }
1097
1098    /// Sets dictionary page size limit for this column.
1099    fn set_dictionary_page_size_limit(&mut self, value: usize) {
1100        self.dictionary_page_size_limit = Some(value);
1101    }
1102
1103    /// Sets the statistics level for this column.
1104    fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1105        self.statistics_enabled = Some(enabled);
1106    }
1107
1108    /// Sets whether to write statistics in the page header for this column.
1109    fn set_write_page_header_statistics(&mut self, enabled: bool) {
1110        self.write_page_header_statistics = Some(enabled);
1111    }
1112
1113    /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1114    /// otherwise it is a no-op.
1115    /// If `value` is `false`, resets bloom filter properties to `None`.
1116    fn set_bloom_filter_enabled(&mut self, value: bool) {
1117        if value && self.bloom_filter_properties.is_none() {
1118            self.bloom_filter_properties = Some(Default::default())
1119        } else if !value {
1120            self.bloom_filter_properties = None
1121        }
1122    }
1123
1124    /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1125    /// bloom filter if not previously enabled.
1126    ///
1127    /// # Panics
1128    ///
1129    /// Panics if the `value` is not between 0 and 1 exclusive
1130    fn set_bloom_filter_fpp(&mut self, value: f64) {
1131        assert!(
1132            value > 0. && value < 1.0,
1133            "fpp must be between 0 and 1 exclusive, got {value}"
1134        );
1135
1136        self.bloom_filter_properties
1137            .get_or_insert_with(Default::default)
1138            .fpp = value;
1139    }
1140
1141    /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
1142    /// enables bloom filter if not previously enabled.
1143    fn set_bloom_filter_ndv(&mut self, value: u64) {
1144        self.bloom_filter_properties
1145            .get_or_insert_with(Default::default)
1146            .ndv = value;
1147    }
1148
1149    /// Returns optional encoding for this column.
1150    fn encoding(&self) -> Option<Encoding> {
1151        self.encoding
1152    }
1153
1154    /// Returns optional compression codec for this column.
1155    fn compression(&self) -> Option<Compression> {
1156        self.codec
1157    }
1158
1159    /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1160    /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1161    /// been provided.
1162    fn dictionary_enabled(&self) -> Option<bool> {
1163        self.dictionary_enabled
1164    }
1165
1166    /// Returns optional dictionary page size limit for this column.
1167    fn dictionary_page_size_limit(&self) -> Option<usize> {
1168        self.dictionary_page_size_limit
1169    }
1170
1171    /// Returns optional statistics level requested for this column. If result is `None`,
1172    /// then no setting has been provided.
1173    fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1174        self.statistics_enabled
1175    }
1176
1177    /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
1178    /// column.
1179    ///
1180    /// [`Statistics`]: crate::file::statistics::Statistics
1181    fn write_page_header_statistics(&self) -> Option<bool> {
1182        self.write_page_header_statistics
1183    }
1184
1185    /// Returns the bloom filter properties, or `None` if not enabled
1186    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1187        self.bloom_filter_properties.as_ref()
1188    }
1189}
1190
1191/// Reference counted reader properties.
1192pub type ReaderPropertiesPtr = Arc<ReaderProperties>;
1193
1194const DEFAULT_READ_BLOOM_FILTER: bool = false;
1195
1196/// Configuration settings for reading parquet files.
1197///
1198/// All properties are immutable and `Send` + `Sync`.
1199/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
1200///
1201/// # Example
1202///
1203/// ```rust
1204/// use parquet::file::properties::ReaderProperties;
1205///
1206/// // Create properties with default configuration.
1207/// let props = ReaderProperties::builder().build();
1208///
1209/// // Use properties builder to set certain options and assemble the configuration.
1210/// let props = ReaderProperties::builder()
1211///     .set_backward_compatible_lz4(false)
1212///     .build();
1213/// ```
1214pub struct ReaderProperties {
1215    codec_options: CodecOptions,
1216    read_bloom_filter: bool,
1217}
1218
1219impl ReaderProperties {
1220    /// Returns builder for reader properties with default values.
1221    pub fn builder() -> ReaderPropertiesBuilder {
1222        ReaderPropertiesBuilder::with_defaults()
1223    }
1224
1225    /// Returns codec options.
1226    pub(crate) fn codec_options(&self) -> &CodecOptions {
1227        &self.codec_options
1228    }
1229
1230    /// Returns whether to read bloom filter
1231    pub(crate) fn read_bloom_filter(&self) -> bool {
1232        self.read_bloom_filter
1233    }
1234}
1235
1236/// Builder for parquet file reader configuration. See example on
1237/// [`ReaderProperties`]
1238pub struct ReaderPropertiesBuilder {
1239    codec_options_builder: CodecOptionsBuilder,
1240    read_bloom_filter: Option<bool>,
1241}
1242
1243/// Reader properties builder.
1244impl ReaderPropertiesBuilder {
1245    /// Returns default state of the builder.
1246    fn with_defaults() -> Self {
1247        Self {
1248            codec_options_builder: CodecOptionsBuilder::default(),
1249            read_bloom_filter: None,
1250        }
1251    }
1252
1253    /// Finalizes the configuration and returns immutable reader properties struct.
1254    pub fn build(self) -> ReaderProperties {
1255        ReaderProperties {
1256            codec_options: self.codec_options_builder.build(),
1257            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1258        }
1259    }
1260
1261    /// Enable/disable backward compatible LZ4.
1262    ///
1263    /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1264    /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1265    /// with files generated by older versions of this library, and LZ4_RAW, for backward
1266    /// compatibility with files generated by older versions of parquet-cpp.
1267    ///
1268    /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1269    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1270        self.codec_options_builder = self
1271            .codec_options_builder
1272            .set_backward_compatible_lz4(value);
1273        self
1274    }
1275
1276    /// Enable/disable reading bloom filter
1277    ///
1278    /// If reading bloom filter is enabled, bloom filter will be read from the file.
1279    /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1280    ///
1281    /// By default bloom filter is set to be read.
1282    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1283        self.read_bloom_filter = Some(value);
1284        self
1285    }
1286}
1287
1288#[cfg(test)]
1289mod tests {
1290    use super::*;
1291
1292    #[test]
1293    fn test_writer_version() {
1294        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
1295        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
1296    }
1297
1298    #[test]
1299    fn test_writer_properties_default_settings() {
1300        let props = WriterProperties::default();
1301        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
1302        assert_eq!(
1303            props.dictionary_page_size_limit(),
1304            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
1305        );
1306        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
1307        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
1308        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
1309        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
1310        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
1311        assert_eq!(props.key_value_metadata(), None);
1312        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
1313        assert_eq!(
1314            props.compression(&ColumnPath::from("col")),
1315            DEFAULT_COMPRESSION
1316        );
1317        assert_eq!(
1318            props.dictionary_enabled(&ColumnPath::from("col")),
1319            DEFAULT_DICTIONARY_ENABLED
1320        );
1321        assert_eq!(
1322            props.statistics_enabled(&ColumnPath::from("col")),
1323            DEFAULT_STATISTICS_ENABLED
1324        );
1325        assert!(props
1326            .bloom_filter_properties(&ColumnPath::from("col"))
1327            .is_none());
1328    }
1329
1330    #[test]
1331    fn test_writer_properties_dictionary_encoding() {
1332        // dictionary encoding is not configurable, and it should be the same for both
1333        // writer version 1 and 2.
1334        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
1335            let props = WriterProperties::builder()
1336                .set_writer_version(*version)
1337                .build();
1338            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
1339            assert_eq!(
1340                props.dictionary_data_page_encoding(),
1341                Encoding::RLE_DICTIONARY
1342            );
1343        }
1344    }
1345
1346    #[test]
1347    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1348    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
1349        // Should panic when user specifies dictionary encoding as fallback encoding.
1350        WriterProperties::builder()
1351            .set_encoding(Encoding::PLAIN_DICTIONARY)
1352            .build();
1353    }
1354
1355    #[test]
1356    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1357    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
1358        // Should panic when user specifies dictionary encoding as fallback encoding.
1359        WriterProperties::builder()
1360            .set_encoding(Encoding::RLE_DICTIONARY)
1361            .build();
1362    }
1363
1364    #[test]
1365    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1366    fn test_writer_properties_panic_when_dictionary_is_enabled() {
1367        WriterProperties::builder()
1368            .set_dictionary_enabled(true)
1369            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
1370            .build();
1371    }
1372
1373    #[test]
1374    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1375    fn test_writer_properties_panic_when_dictionary_is_disabled() {
1376        WriterProperties::builder()
1377            .set_dictionary_enabled(false)
1378            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
1379            .build();
1380    }
1381
1382    #[test]
1383    fn test_writer_properties_builder() {
1384        let props = WriterProperties::builder()
1385            // file settings
1386            .set_writer_version(WriterVersion::PARQUET_2_0)
1387            .set_data_page_size_limit(10)
1388            .set_dictionary_page_size_limit(20)
1389            .set_write_batch_size(30)
1390            .set_max_row_group_size(40)
1391            .set_created_by("default".to_owned())
1392            .set_key_value_metadata(Some(vec![KeyValue::new(
1393                "key".to_string(),
1394                "value".to_string(),
1395            )]))
1396            // global column settings
1397            .set_encoding(Encoding::DELTA_BINARY_PACKED)
1398            .set_compression(Compression::GZIP(Default::default()))
1399            .set_dictionary_enabled(false)
1400            .set_statistics_enabled(EnabledStatistics::None)
1401            // specific column settings
1402            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
1403            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
1404            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
1405            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
1406            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
1407            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
1408            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
1409            .build();
1410
1411        fn test_props(props: &WriterProperties) {
1412            assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
1413            assert_eq!(props.data_page_size_limit(), 10);
1414            assert_eq!(props.dictionary_page_size_limit(), 20);
1415            assert_eq!(props.write_batch_size(), 30);
1416            assert_eq!(props.max_row_group_size(), 40);
1417            assert_eq!(props.created_by(), "default");
1418            assert_eq!(
1419                props.key_value_metadata(),
1420                Some(&vec![
1421                    KeyValue::new("key".to_string(), "value".to_string(),)
1422                ])
1423            );
1424
1425            assert_eq!(
1426                props.encoding(&ColumnPath::from("a")),
1427                Some(Encoding::DELTA_BINARY_PACKED)
1428            );
1429            assert_eq!(
1430                props.compression(&ColumnPath::from("a")),
1431                Compression::GZIP(Default::default())
1432            );
1433            assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
1434            assert_eq!(
1435                props.statistics_enabled(&ColumnPath::from("a")),
1436                EnabledStatistics::None
1437            );
1438
1439            assert_eq!(
1440                props.encoding(&ColumnPath::from("col")),
1441                Some(Encoding::RLE)
1442            );
1443            assert_eq!(
1444                props.compression(&ColumnPath::from("col")),
1445                Compression::SNAPPY
1446            );
1447            assert!(props.dictionary_enabled(&ColumnPath::from("col")));
1448            assert_eq!(
1449                props.statistics_enabled(&ColumnPath::from("col")),
1450                EnabledStatistics::Chunk
1451            );
1452            assert_eq!(
1453                props.bloom_filter_properties(&ColumnPath::from("col")),
1454                Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
1455            );
1456        }
1457
1458        // Test direct build of properties
1459        test_props(&props);
1460
1461        // Test that into_builder() gives the same result
1462        let props_into_builder_and_back = props.into_builder().build();
1463        test_props(&props_into_builder_and_back);
1464    }
1465
1466    #[test]
1467    fn test_writer_properties_builder_partial_defaults() {
1468        let props = WriterProperties::builder()
1469            .set_encoding(Encoding::DELTA_BINARY_PACKED)
1470            .set_compression(Compression::GZIP(Default::default()))
1471            .set_bloom_filter_enabled(true)
1472            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
1473            .build();
1474
1475        assert_eq!(
1476            props.encoding(&ColumnPath::from("col")),
1477            Some(Encoding::RLE)
1478        );
1479        assert_eq!(
1480            props.compression(&ColumnPath::from("col")),
1481            Compression::GZIP(Default::default())
1482        );
1483        assert_eq!(
1484            props.dictionary_enabled(&ColumnPath::from("col")),
1485            DEFAULT_DICTIONARY_ENABLED
1486        );
1487        assert_eq!(
1488            props.bloom_filter_properties(&ColumnPath::from("col")),
1489            Some(&BloomFilterProperties {
1490                fpp: 0.05,
1491                ndv: 1_000_000_u64
1492            })
1493        );
1494    }
1495
1496    #[test]
1497    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
1498        assert_eq!(
1499            WriterProperties::builder()
1500                .build()
1501                .bloom_filter_properties(&ColumnPath::from("col")),
1502            None
1503        );
1504        assert_eq!(
1505            WriterProperties::builder()
1506                .set_bloom_filter_ndv(100)
1507                .build()
1508                .bloom_filter_properties(&ColumnPath::from("col")),
1509            Some(&BloomFilterProperties {
1510                fpp: 0.05,
1511                ndv: 100
1512            })
1513        );
1514        assert_eq!(
1515            WriterProperties::builder()
1516                .set_bloom_filter_fpp(0.1)
1517                .build()
1518                .bloom_filter_properties(&ColumnPath::from("col")),
1519            Some(&BloomFilterProperties {
1520                fpp: 0.1,
1521                ndv: 1_000_000_u64
1522            })
1523        );
1524    }
1525
1526    #[test]
1527    fn test_writer_properties_column_dictionary_page_size_limit() {
1528        let props = WriterProperties::builder()
1529            .set_dictionary_page_size_limit(100)
1530            .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
1531            .build();
1532
1533        assert_eq!(props.dictionary_page_size_limit(), 100);
1534        assert_eq!(
1535            props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
1536            10
1537        );
1538        assert_eq!(
1539            props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
1540            100
1541        );
1542    }
1543
1544    #[test]
1545    fn test_reader_properties_default_settings() {
1546        let props = ReaderProperties::builder().build();
1547
1548        let codec_options = CodecOptionsBuilder::default()
1549            .set_backward_compatible_lz4(true)
1550            .build();
1551
1552        assert_eq!(props.codec_options(), &codec_options);
1553        assert!(!props.read_bloom_filter());
1554    }
1555
1556    #[test]
1557    fn test_reader_properties_builder() {
1558        let props = ReaderProperties::builder()
1559            .set_backward_compatible_lz4(false)
1560            .build();
1561
1562        let codec_options = CodecOptionsBuilder::default()
1563            .set_backward_compatible_lz4(false)
1564            .build();
1565
1566        assert_eq!(props.codec_options(), &codec_options);
1567    }
1568
1569    #[test]
1570    fn test_parse_writerversion() {
1571        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
1572        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1573        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
1574        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);
1575
1576        // test lowercase
1577        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
1578        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1579
1580        // test invalid version
1581        match "PARQUET_-1_0".parse::<WriterVersion>() {
1582            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
1583            Err(e) => {
1584                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
1585            }
1586        }
1587    }
1588
1589    #[test]
1590    fn test_parse_enabledstatistics() {
1591        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
1592        assert_eq!(enabled_statistics, EnabledStatistics::None);
1593        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
1594        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
1595        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
1596        assert_eq!(enabled_statistics, EnabledStatistics::Page);
1597
1598        // test lowercase
1599        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
1600        assert_eq!(enabled_statistics, EnabledStatistics::None);
1601
1602        //test invalid statistics
1603        match "ChunkAndPage".parse::<EnabledStatistics>() {
1604            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
1605            Err(e) => {
1606                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
1607            }
1608        }
1609    }
1610}