parquet/file/
properties.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::{KeyValue, SortingColumn};
24use crate::schema::types::ColumnPath;
25use std::str::FromStr;
26use std::{collections::HashMap, sync::Arc};
27
/// Default value for [`WriterProperties::data_page_size_limit`]
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
///
/// Defined in terms of [`DEFAULT_PAGE_SIZE`], so the two defaults stay equal.
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_size`]
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
///
/// The crate version is baked in at compile time from `CARGO_PKG_VERSION`.
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
64
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns the numeric form of this writer version: `1` for
    /// [`Self::PARQUET_1_0`] and `2` for [`Self::PARQUET_2_0`].
    pub fn as_num(&self) -> i32 {
        match *self {
            Self::PARQUET_1_0 => 1,
            Self::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a writer version from its name.
    ///
    /// Only the all-uppercase and all-lowercase spellings of each variant
    /// name are accepted; any other input yields an `Err` carrying a message
    /// that echoes the rejected string.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        const V1_NAMES: [&str; 2] = ["PARQUET_1_0", "parquet_1_0"];
        const V2_NAMES: [&str; 2] = ["PARQUET_2_0", "parquet_2_0"];

        if V1_NAMES.contains(&s) {
            Ok(Self::PARQUET_1_0)
        } else if V2_NAMES.contains(&s) {
            Ok(Self::PARQUET_2_0)
        } else {
            Err(format!("Invalid writer version: {s}"))
        }
    }
}
98
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The position used when none is configured is [`DEFAULT_BLOOM_FILTER_POSITION`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
116
/// Reference counted writer properties: a [`WriterProperties`] held behind an [`Arc`].
pub type WriterPropertiesPtr = Arc<WriterProperties>;
119
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// Settings that can differ per column are looked up first in the per-column
/// overrides and then in the column defaults; see the individual accessors.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    /// See [`Self::data_page_size_limit`]
    data_page_size_limit: usize,
    /// See [`Self::data_page_row_count_limit`]
    data_page_row_count_limit: usize,
    /// See [`Self::write_batch_size`]
    write_batch_size: usize,
    /// See [`Self::max_row_group_size`]
    max_row_group_size: usize,
    /// See [`Self::bloom_filter_position`]
    bloom_filter_position: BloomFilterPosition,
    /// See [`Self::writer_version`]
    writer_version: WriterVersion,
    /// See [`Self::created_by`]
    created_by: String,
    /// Raw flag as configured; [`Self::offset_index_disabled`] may report
    /// `false` even when this is `true`.
    offset_index_disabled: bool,
    /// See [`Self::key_value_metadata`]
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings consulted when a column has no override of its own.
    default_column_properties: ColumnProperties,
    /// Per-column overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// See [`Self::sorting_columns`]
    sorting_columns: Option<Vec<SortingColumn>>,
    /// See [`Self::column_index_truncate_length`]
    column_index_truncate_length: Option<usize>,
    /// See [`Self::statistics_truncate_length`]
    statistics_truncate_length: Option<usize>,
    /// See [`Self::coerce_types`]
    coerce_types: bool,
    /// See [`Self::file_encryption_properties`]
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
174
175impl Default for WriterProperties {
176    fn default() -> Self {
177        Self::builder().build()
178    }
179}
180
181impl WriterProperties {
182    /// Create a new [`WriterProperties`] with the default settings
183    ///
184    /// See [`WriterProperties::builder`] for customising settings
185    pub fn new() -> Self {
186        Self::default()
187    }
188
189    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
190    /// properties.
191    pub fn builder() -> WriterPropertiesBuilder {
192        WriterPropertiesBuilder::default()
193    }
194
195    /// Converts this [`WriterProperties`] into a [`WriterPropertiesBuilder`]
196    /// Used for mutating existing property settings
197    pub fn into_builder(self) -> WriterPropertiesBuilder {
198        self.into()
199    }
200
201    /// Returns data page size limit.
202    ///
203    /// Note: this is a best effort limit based on the write batch size
204    ///
205    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
206    pub fn data_page_size_limit(&self) -> usize {
207        self.data_page_size_limit
208    }
209
210    /// Returns dictionary page size limit.
211    ///
212    /// Note: this is a best effort limit based on the write batch size
213    ///
214    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
215    pub fn dictionary_page_size_limit(&self) -> usize {
216        self.default_column_properties
217            .dictionary_page_size_limit()
218            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
219    }
220
221    /// Returns dictionary page size limit for a specific column.
222    pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
223        self.column_properties
224            .get(col)
225            .and_then(|c| c.dictionary_page_size_limit())
226            .or_else(|| self.default_column_properties.dictionary_page_size_limit())
227            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
228    }
229
230    /// Returns the maximum page row count
231    ///
232    /// Note: this is a best effort limit based on the write batch size
233    ///
234    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
235    pub fn data_page_row_count_limit(&self) -> usize {
236        self.data_page_row_count_limit
237    }
238
239    /// Returns configured batch size for writes.
240    ///
241    /// When writing a batch of data, this setting allows to split it internally into
242    /// smaller batches so we can better estimate the size of a page currently being
243    /// written.
244    ///
245    /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
246    pub fn write_batch_size(&self) -> usize {
247        self.write_batch_size
248    }
249
250    /// Returns maximum number of rows in a row group.
251    ///
252    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
253    pub fn max_row_group_size(&self) -> usize {
254        self.max_row_group_size
255    }
256
257    /// Returns bloom filter position.
258    ///
259    /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
260    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
261        self.bloom_filter_position
262    }
263
264    /// Returns configured writer version.
265    ///
266    /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
267    pub fn writer_version(&self) -> WriterVersion {
268        self.writer_version
269    }
270
271    /// Returns `created_by` string.
272    ///
273    /// For more details see [`WriterPropertiesBuilder::set_created_by`]
274    pub fn created_by(&self) -> &str {
275        &self.created_by
276    }
277
278    /// Returns `true` if offset index writing is disabled.
279    ///
280    /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
281    pub fn offset_index_disabled(&self) -> bool {
282        // If page statistics are to be collected, then do not disable the offset indexes.
283        let default_page_stats_enabled =
284            self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
285        let column_page_stats_enabled = self
286            .column_properties
287            .iter()
288            .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
289        if default_page_stats_enabled || column_page_stats_enabled {
290            return false;
291        }
292
293        self.offset_index_disabled
294    }
295
296    /// Returns `key_value_metadata` KeyValue pairs.
297    ///
298    /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
299    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
300        self.key_value_metadata.as_ref()
301    }
302
303    /// Returns sorting columns.
304    ///
305    /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
306    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
307        self.sorting_columns.as_ref()
308    }
309
310    /// Returns the maximum length of truncated min/max values in the column index.
311    ///
312    /// `None` if truncation is disabled, must be greater than 0 otherwise.
313    ///
314    /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
315    pub fn column_index_truncate_length(&self) -> Option<usize> {
316        self.column_index_truncate_length
317    }
318
319    /// Returns the maximum length of truncated min/max values in [`Statistics`].
320    ///
321    /// `None` if truncation is disabled, must be greater than 0 otherwise.
322    ///
323    /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
324    ///
325    /// [`Statistics`]: crate::file::statistics::Statistics
326    pub fn statistics_truncate_length(&self) -> Option<usize> {
327        self.statistics_truncate_length
328    }
329
330    /// Returns `true` if type coercion is enabled.
331    ///
332    /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
333    pub fn coerce_types(&self) -> bool {
334        self.coerce_types
335    }
336
337    /// Returns encoding for a data page, when dictionary encoding is enabled.
338    ///
339    /// This is not configurable.
340    #[inline]
341    pub fn dictionary_data_page_encoding(&self) -> Encoding {
342        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
343        // Dictionary values are encoded using RLE_DICTIONARY encoding.
344        Encoding::RLE_DICTIONARY
345    }
346
347    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
348    ///
349    /// This is not configurable.
350    #[inline]
351    pub fn dictionary_page_encoding(&self) -> Encoding {
352        // PLAIN_DICTIONARY is deprecated in writer version 1.
353        // Dictionary is encoded using plain encoding.
354        Encoding::PLAIN
355    }
356
357    /// Returns encoding for a column, if set.
358    ///
359    /// In case when dictionary is enabled, returns fallback encoding.
360    ///
361    /// If encoding is not set, then column writer will choose the best encoding
362    /// based on the column type.
363    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
364        self.column_properties
365            .get(col)
366            .and_then(|c| c.encoding())
367            .or_else(|| self.default_column_properties.encoding())
368    }
369
370    /// Returns compression codec for a column.
371    ///
372    /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
373    pub fn compression(&self, col: &ColumnPath) -> Compression {
374        self.column_properties
375            .get(col)
376            .and_then(|c| c.compression())
377            .or_else(|| self.default_column_properties.compression())
378            .unwrap_or(DEFAULT_COMPRESSION)
379    }
380
381    /// Returns `true` if dictionary encoding is enabled for a column.
382    ///
383    /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
384    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
385        self.column_properties
386            .get(col)
387            .and_then(|c| c.dictionary_enabled())
388            .or_else(|| self.default_column_properties.dictionary_enabled())
389            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
390    }
391
392    /// Returns which statistics are written for a column.
393    ///
394    /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
395    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
396        self.column_properties
397            .get(col)
398            .and_then(|c| c.statistics_enabled())
399            .or_else(|| self.default_column_properties.statistics_enabled())
400            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
401    }
402
403    /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
404    ///
405    /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
406    ///
407    /// [`Statistics`]: crate::file::statistics::Statistics
408    pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
409        self.column_properties
410            .get(col)
411            .and_then(|c| c.write_page_header_statistics())
412            .or_else(|| {
413                self.default_column_properties
414                    .write_page_header_statistics()
415            })
416            .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
417    }
418
419    /// Returns the [`BloomFilterProperties`] for the given column
420    ///
421    /// Returns `None` if bloom filter is disabled
422    ///
423    /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
424    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
425        self.column_properties
426            .get(col)
427            .and_then(|c| c.bloom_filter_properties())
428            .or_else(|| self.default_column_properties.bloom_filter_properties())
429    }
430
431    /// Return file encryption properties
432    ///
433    /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
434    #[cfg(feature = "encryption")]
435    pub fn file_encryption_properties(&self) -> Option<&Arc<FileEncryptionProperties>> {
436        self.file_encryption_properties.as_ref()
437    }
438}
439
/// Builder for [`WriterProperties`] Parquet writer configuration.
///
/// Each field mirrors the field of the same name on [`WriterProperties`];
/// [`Self::build`] moves them across unchanged.
///
/// See example on [`WriterProperties`]
#[derive(Debug, Clone)]
pub struct WriterPropertiesBuilder {
    /// See [`Self::set_data_page_size_limit`]
    data_page_size_limit: usize,
    /// See [`Self::set_data_page_row_count_limit`]
    data_page_row_count_limit: usize,
    /// See [`Self::set_write_batch_size`]
    write_batch_size: usize,
    /// See [`Self::set_max_row_group_size`]
    max_row_group_size: usize,
    /// See [`Self::set_bloom_filter_position`]
    bloom_filter_position: BloomFilterPosition,
    /// See [`Self::set_writer_version`]
    writer_version: WriterVersion,
    /// See [`Self::set_created_by`]
    created_by: String,
    /// See [`Self::set_offset_index_disabled`]
    offset_index_disabled: bool,
    /// See [`Self::set_key_value_metadata`]
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings modified by the "any column" setters such as
    /// [`Self::set_encoding`] and [`Self::set_compression`].
    default_column_properties: ColumnProperties,
    /// Per-column overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// See [`Self::set_sorting_columns`]
    sorting_columns: Option<Vec<SortingColumn>>,
    /// See [`Self::set_column_index_truncate_length`]
    column_index_truncate_length: Option<usize>,
    /// See [`Self::set_statistics_truncate_length`]
    statistics_truncate_length: Option<usize>,
    /// See [`Self::set_coerce_types`]
    coerce_types: bool,
    /// See [`Self::with_file_encryption_properties`]
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
463
464impl Default for WriterPropertiesBuilder {
465    /// Returns default state of the builder.
466    fn default() -> Self {
467        Self {
468            data_page_size_limit: DEFAULT_PAGE_SIZE,
469            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
470            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
471            max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
472            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
473            writer_version: DEFAULT_WRITER_VERSION,
474            created_by: DEFAULT_CREATED_BY.to_string(),
475            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
476            key_value_metadata: None,
477            default_column_properties: Default::default(),
478            column_properties: HashMap::new(),
479            sorting_columns: None,
480            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
481            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
482            coerce_types: DEFAULT_COERCE_TYPES,
483            #[cfg(feature = "encryption")]
484            file_encryption_properties: None,
485        }
486    }
487}
488
489impl WriterPropertiesBuilder {
490    /// Finalizes the configuration and returns immutable writer properties struct.
491    pub fn build(self) -> WriterProperties {
492        WriterProperties {
493            data_page_size_limit: self.data_page_size_limit,
494            data_page_row_count_limit: self.data_page_row_count_limit,
495            write_batch_size: self.write_batch_size,
496            max_row_group_size: self.max_row_group_size,
497            bloom_filter_position: self.bloom_filter_position,
498            writer_version: self.writer_version,
499            created_by: self.created_by,
500            offset_index_disabled: self.offset_index_disabled,
501            key_value_metadata: self.key_value_metadata,
502            default_column_properties: self.default_column_properties,
503            column_properties: self.column_properties,
504            sorting_columns: self.sorting_columns,
505            column_index_truncate_length: self.column_index_truncate_length,
506            statistics_truncate_length: self.statistics_truncate_length,
507            coerce_types: self.coerce_types,
508            #[cfg(feature = "encryption")]
509            file_encryption_properties: self.file_encryption_properties,
510        }
511    }
512
513    // ----------------------------------------------------------------------
514    // Writer properties related to a file
515
516    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
517    /// via [`DEFAULT_WRITER_VERSION`])
518    ///
519    /// This value can determine what features some readers will support.
520    ///
521    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
522    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
523        self.writer_version = value;
524        self
525    }
526
527    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
528    /// via [`DEFAULT_PAGE_SIZE`]).
529    ///
530    /// The parquet writer will attempt to limit the sizes of each
531    /// `DataPage` to this many bytes. Reducing this value will result
532    /// in larger parquet files, but may improve the effectiveness of
533    /// page index based predicate pushdown during reading.
534    ///
535    /// Note: this is a best effort limit based on value of
536    /// [`set_write_batch_size`](Self::set_write_batch_size).
537    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
538        self.data_page_size_limit = value;
539        self
540    }
541
542    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
543    /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
544    ///
545    /// The parquet writer will attempt to limit the number of rows in
546    /// each `DataPage` to this value. Reducing this value will result
547    /// in larger parquet files, but may improve the effectiveness of
548    /// page index based predicate pushdown during reading.
549    ///
550    /// Note: this is a best effort limit based on value of
551    /// [`set_write_batch_size`](Self::set_write_batch_size).
552    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
553        self.data_page_row_count_limit = value;
554        self
555    }
556
557    /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
558    ///
559    /// For performance reasons, data for each column is written in
560    /// batches of this size.
561    ///
562    /// Additional limits such as such as
563    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
564    /// are checked between batches, and thus the write batch size value acts as an
565    /// upper-bound on the enforcement granularity of other limits.
566    pub fn set_write_batch_size(mut self, value: usize) -> Self {
567        self.write_batch_size = value;
568        self
569    }
570
571    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
572    /// via [`DEFAULT_MAX_ROW_GROUP_SIZE`]).
573    ///
574    /// # Panics
575    /// If the value is set to 0.
576    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
577        assert!(value > 0, "Cannot have a 0 max row group size");
578        self.max_row_group_size = value;
579        self
580    }
581
582    /// Sets where in the final file Bloom Filters are written (defaults to  [`AfterRowGroup`]
583    /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
584    ///
585    /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
586    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
587        self.bloom_filter_position = value;
588        self
589    }
590
591    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
592    /// [`DEFAULT_CREATED_BY`]).
593    ///
594    /// This is a string that will be written into the file metadata
595    pub fn set_created_by(mut self, value: String) -> Self {
596        self.created_by = value;
597        self
598    }
599
600    /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
601    /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
602    ///
603    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
604    ///
605    /// Note: As the offset indexes are useful for accessing data by row number,
606    /// they are always written by default, regardless of whether other statistics
607    /// are enabled. Disabling this metadata may result in a degradation in read
608    /// performance, so use this option with care.
609    ///
610    /// [`Page`]: EnabledStatistics::Page
611    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
612        self.offset_index_disabled = value;
613        self
614    }
615
616    /// Sets "key_value_metadata" property (defaults to `None`).
617    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
618        self.key_value_metadata = value;
619        self
620    }
621
622    /// Sets sorting order of rows in the row group if any (defaults to `None`).
623    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
624        self.sorting_columns = value;
625        self
626    }
627
628    /// Sets the max length of min/max value fields when writing the column
629    /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
630    ///
631    /// This can be used to prevent columns with very long values (hundreds of
632    /// bytes long) from causing the parquet metadata to become huge.
633    ///
634    /// # Notes
635    ///
636    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
637    /// set to [`EnabledStatistics::Page`].
638    ///
639    /// * If `Some`, must be greater than 0, otherwise will panic
640    /// * If `None`, there's no effective limit.
641    ///
642    /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
643    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
644        if let Some(value) = max_length {
645            assert!(
646                value > 0,
647                "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`."
648            );
649        }
650
651        self.column_index_truncate_length = max_length;
652        self
653    }
654
655    /// Sets the max length of min/max value fields in row group and data page header
656    /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
657    ///
658    /// # Notes
659    /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
660    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
661    /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
662    /// [`EnabledStatistics::Page`].
663    ///
664    /// * If `Some`, must be greater than 0, otherwise will panic
665    /// * If `None`, there's no effective limit.
666    ///
667    /// # See also
668    /// Truncation of Page Index statistics is controlled separately via
669    /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
670    ///
671    /// [`Statistics`]: crate::file::statistics::Statistics
672    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
673        if let Some(value) = max_length {
674            assert!(
675                value > 0,
676                "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`."
677            );
678        }
679
680        self.statistics_truncate_length = max_length;
681        self
682    }
683
684    /// Should the writer coerce types to parquet native types (defaults to `false` via
685    /// [`DEFAULT_COERCE_TYPES`]).
686    ///
687    /// Leaving this option the default `false` will ensure the exact same data
688    /// written to parquet using this library will be read.
689    ///
690    /// Setting this option to `true` will result in parquet files that can be
691    /// read by more readers, but potentially lose information in the process.
692    ///
693    /// * Types such as [`DataType::Date64`], which have no direct corresponding
694    ///   Parquet type, may be stored with lower precision.
695    ///
696    /// * The internal field names of `List` and `Map` types will be renamed if
697    ///   necessary to match what is required by the newest Parquet specification.
698    ///
699    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
700    ///
701    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
702    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
703    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
704        self.coerce_types = coerce_types;
705        self
706    }
707
708    /// Sets FileEncryptionProperties (defaults to `None`)
709    #[cfg(feature = "encryption")]
710    pub fn with_file_encryption_properties(
711        mut self,
712        file_encryption_properties: Arc<FileEncryptionProperties>,
713    ) -> Self {
714        self.file_encryption_properties = Some(file_encryption_properties);
715        self
716    }
717
718    // ----------------------------------------------------------------------
719    // Setters for any column (global)
720
721    /// Sets default encoding for all columns.
722    ///
723    /// If dictionary is not enabled, this is treated as a primary encoding for all
724    /// columns. In case when dictionary is enabled for any column, this value is
725    /// considered to be a fallback encoding for that column.
726    ///
727    /// # Panics
728    ///
729    /// if dictionary encoding is specified, regardless of dictionary
730    /// encoding flag being set.
731    pub fn set_encoding(mut self, value: Encoding) -> Self {
732        self.default_column_properties.set_encoding(value);
733        self
734    }
735
736    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
737    /// [`DEFAULT_COMPRESSION`]).
738    ///
739    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
740    pub fn set_compression(mut self, value: Compression) -> Self {
741        self.default_column_properties.set_compression(value);
742        self
743    }
744
745    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
746    /// via [`DEFAULT_DICTIONARY_ENABLED`]).
747    ///
748    /// Use this method to set dictionary encoding, instead of explicitly specifying
749    /// encoding in `set_encoding` method.
750    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
751        self.default_column_properties.set_dictionary_enabled(value);
752        self
753    }
754
755    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
756    /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
757    ///
758    /// The parquet writer will attempt to limit the size of each
759    /// `DataPage` used to store dictionaries to this many
760    /// bytes. Reducing this value will result in larger parquet
761    /// files, but may improve the effectiveness of page index based
762    /// predicate pushdown during reading.
763    ///
764    /// Note: this is a best effort limit based on value of
765    /// [`set_write_batch_size`](Self::set_write_batch_size).
766    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
767        self.default_column_properties
768            .set_dictionary_page_size_limit(value);
769        self
770    }
771
772    /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
773    /// [`DEFAULT_STATISTICS_ENABLED`]).
774    ///
775    /// [`Page`]: EnabledStatistics::Page
776    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
777        self.default_column_properties.set_statistics_enabled(value);
778        self
779    }
780
781    /// enable/disable writing [`Statistics`] in the page header
782    /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
783    ///
784    /// Only applicable if [`Page`] level statistics are gathered.
785    ///
786    /// Setting this value to `true` can greatly increase the size of the resulting Parquet
787    /// file while yielding very little added benefit. Most modern Parquet implementations
788    /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
789    /// those in the page header.
790    ///
791    /// # Note
792    ///
793    /// Prior to version 56.0.0, the `parquet` crate always wrote these
794    /// statistics (the equivalent of setting this option to `true`). This was
795    /// changed in 56.0.0 to follow the recommendation in the Parquet
796    /// specification. See [issue #7580] for more details.
797    ///
798    /// [`Statistics`]: crate::file::statistics::Statistics
799    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
800    /// [`Page`]: EnabledStatistics::Page
801    /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
802    pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
803        self.default_column_properties
804            .set_write_page_header_statistics(value);
805        self
806    }
807
808    /// Sets if bloom filter should be written for all columns (defaults to `false`).
809    ///
810    /// # Notes
811    ///
812    /// * If the bloom filter is enabled previously then it is a no-op.
813    ///
814    /// * If the bloom filter is not enabled, default values for ndv and fpp
815    ///   value are used used. See [`set_bloom_filter_ndv`] and
816    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
817    ///
818    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
819    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
820    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
821        self.default_column_properties
822            .set_bloom_filter_enabled(value);
823        self
824    }
825
826    /// Sets the default target bloom filter false positive probability (fpp)
827    /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
828    ///
829    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
830    /// been called.
831    ///
832    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
833    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
834        self.default_column_properties.set_bloom_filter_fpp(value);
835        self
836    }
837
838    /// Sets default number of distinct values (ndv) for bloom filter for all
839    /// columns (defaults to `1_000_000` via [`DEFAULT_BLOOM_FILTER_NDV`]).
840    ///
841    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
842    /// been called.
843    ///
844    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
845    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
846        self.default_column_properties.set_bloom_filter_ndv(value);
847        self
848    }
849
850    // ----------------------------------------------------------------------
851    // Setters for a specific column
852
853    /// Helper method to get existing or new mutable reference of column properties.
854    #[inline]
855    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
856        self.column_properties.entry(col).or_default()
857    }
858
859    /// Sets encoding for a specific column.
860    ///
861    /// Takes precedence over [`Self::set_encoding`].
862    ///
863    /// If dictionary is not enabled, this is treated as a primary encoding for this
864    /// column. In case when dictionary is enabled for this column, either through
865    /// global defaults or explicitly, this value is considered to be a fallback
866    /// encoding for this column.
867    ///
868    /// # Panics
869    /// If user tries to set dictionary encoding here, regardless of dictionary
870    /// encoding flag being set.
871    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
872        self.get_mut_props(col).set_encoding(value);
873        self
874    }
875
876    /// Sets compression codec for a specific column.
877    ///
878    /// Takes precedence over [`Self::set_compression`].
879    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
880        self.get_mut_props(col).set_compression(value);
881        self
882    }
883
884    /// Sets flag to enable/disable dictionary encoding for a specific column.
885    ///
886    /// Takes precedence over [`Self::set_dictionary_enabled`].
887    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
888        self.get_mut_props(col).set_dictionary_enabled(value);
889        self
890    }
891
892    /// Sets dictionary page size limit for a specific column.
893    ///
894    /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
895    pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
896        self.get_mut_props(col)
897            .set_dictionary_page_size_limit(value);
898        self
899    }
900
901    /// Sets [`EnabledStatistics`] level for a specific column.
902    ///
903    /// Takes precedence over [`Self::set_statistics_enabled`].
904    pub fn set_column_statistics_enabled(
905        mut self,
906        col: ColumnPath,
907        value: EnabledStatistics,
908    ) -> Self {
909        self.get_mut_props(col).set_statistics_enabled(value);
910        self
911    }
912
913    /// Sets whether to write [`Statistics`] in the page header for a specific column.
914    ///
915    /// Takes precedence over [`Self::set_write_page_header_statistics`].
916    ///
917    /// [`Statistics`]: crate::file::statistics::Statistics
918    pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
919        self.get_mut_props(col)
920            .set_write_page_header_statistics(value);
921        self
922    }
923
924    /// Sets whether a bloom filter should be written for a specific column.
925    ///
926    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
927    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
928        self.get_mut_props(col).set_bloom_filter_enabled(value);
929        self
930    }
931
932    /// Sets the false positive probability for bloom filter for a specific column.
933    ///
934    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
935    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
936        self.get_mut_props(col).set_bloom_filter_fpp(value);
937        self
938    }
939
940    /// Sets the number of distinct values for bloom filter for a specific column.
941    ///
942    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
943    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
944        self.get_mut_props(col).set_bloom_filter_ndv(value);
945        self
946    }
947}
948
949impl From<WriterProperties> for WriterPropertiesBuilder {
950    fn from(props: WriterProperties) -> Self {
951        WriterPropertiesBuilder {
952            data_page_size_limit: props.data_page_size_limit,
953            data_page_row_count_limit: props.data_page_row_count_limit,
954            write_batch_size: props.write_batch_size,
955            max_row_group_size: props.max_row_group_size,
956            bloom_filter_position: props.bloom_filter_position,
957            writer_version: props.writer_version,
958            created_by: props.created_by,
959            offset_index_disabled: props.offset_index_disabled,
960            key_value_metadata: props.key_value_metadata,
961            default_column_properties: props.default_column_properties,
962            column_properties: props.column_properties,
963            sorting_columns: props.sorting_columns,
964            column_index_truncate_length: props.column_index_truncate_length,
965            statistics_truncate_length: props.statistics_truncate_length,
966            coerce_types: props.coerce_types,
967            #[cfg(feature = "encryption")]
968            file_encryption_properties: props.file_encryption_properties,
969        }
970    }
971}
972
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution, provided the query engine supports
/// evaluating the predicate using the statistics.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. In addition, this will enable the writing
    /// of the column index (the offset index is always written regardless of
    /// this setting). See [`ParquetColumnIndex`] for
    /// more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}
1003
1004impl FromStr for EnabledStatistics {
1005    type Err = String;
1006
1007    fn from_str(s: &str) -> Result<Self, Self::Err> {
1008        match s {
1009            "NONE" | "none" => Ok(EnabledStatistics::None),
1010            "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk),
1011            "PAGE" | "page" => Ok(EnabledStatistics::Page),
1012            _ => Err(format!("Invalid statistics arg: {s}")),
1013        }
1014    }
1015}
1016
/// The default statistics level is [`DEFAULT_STATISTICS_ENABLED`].
impl Default for EnabledStatistics {
    /// Returns [`DEFAULT_STATISTICS_ENABLED`].
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
1022
/// Controls the bloom filter to be computed by the writer.
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability. This should always be between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp: the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// Setting to a very small number diminishes the value of the filter itself, as the bitset size is
    /// even larger than just storing the whole value. You are also expected to set `ndv` if it can
    /// be known in advance to greatly reduce space usage.
    pub fpp: f64,
    /// Number of distinct values (ndv). Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Usage of bloom filter is most beneficial for columns with large cardinality, so a good heuristic
    /// is to set ndv to the number of rows. However, it can reduce disk size if you know in advance a smaller
    /// number of distinct values. For very small ndv value it is probably not worth it to use bloom filter
    /// anyway.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
    pub ndv: u64,
}
1050
1051impl Default for BloomFilterProperties {
1052    fn default() -> Self {
1053        BloomFilterProperties {
1054            fpp: DEFAULT_BLOOM_FILTER_FPP,
1055            ndv: DEFAULT_BLOOM_FILTER_NDV,
1056        }
1057    }
1058}
1059
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    /// Encoding requested for the column, if any.
    encoding: Option<Encoding>,
    /// Compression codec requested for the column, if any.
    codec: Option<Compression>,
    /// Dictionary page size limit requested for the column, if any.
    dictionary_page_size_limit: Option<usize>,
    /// Whether dictionary encoding was explicitly enabled or disabled for the column.
    dictionary_enabled: Option<bool>,
    /// Statistics level requested for the column, if any.
    statistics_enabled: Option<EnabledStatistics>,
    /// Whether writing statistics in the page header was explicitly enabled or disabled.
    write_page_header_statistics: Option<bool>,
    /// bloom filter related properties; `None` means the bloom filter is not enabled
    bloom_filter_properties: Option<BloomFilterProperties>,
}
1075
1076impl ColumnProperties {
1077    /// Sets encoding for this column.
1078    ///
1079    /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1080    /// In case when dictionary is enabled for a column, this value is considered to
1081    /// be a fallback encoding.
1082    ///
1083    /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1084    /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1085    /// for a column.
1086    fn set_encoding(&mut self, value: Encoding) {
1087        if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1088            panic!("Dictionary encoding can not be used as fallback encoding");
1089        }
1090        self.encoding = Some(value);
1091    }
1092
1093    /// Sets compression codec for this column.
1094    fn set_compression(&mut self, value: Compression) {
1095        self.codec = Some(value);
1096    }
1097
1098    /// Sets whether dictionary encoding is enabled for this column.
1099    fn set_dictionary_enabled(&mut self, enabled: bool) {
1100        self.dictionary_enabled = Some(enabled);
1101    }
1102
1103    /// Sets dictionary page size limit for this column.
1104    fn set_dictionary_page_size_limit(&mut self, value: usize) {
1105        self.dictionary_page_size_limit = Some(value);
1106    }
1107
1108    /// Sets the statistics level for this column.
1109    fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1110        self.statistics_enabled = Some(enabled);
1111    }
1112
1113    /// Sets whether to write statistics in the page header for this column.
1114    fn set_write_page_header_statistics(&mut self, enabled: bool) {
1115        self.write_page_header_statistics = Some(enabled);
1116    }
1117
1118    /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1119    /// otherwise it is a no-op.
1120    /// If `value` is `false`, resets bloom filter properties to `None`.
1121    fn set_bloom_filter_enabled(&mut self, value: bool) {
1122        if value && self.bloom_filter_properties.is_none() {
1123            self.bloom_filter_properties = Some(Default::default())
1124        } else if !value {
1125            self.bloom_filter_properties = None
1126        }
1127    }
1128
1129    /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1130    /// bloom filter if not previously enabled.
1131    ///
1132    /// # Panics
1133    ///
1134    /// Panics if the `value` is not between 0 and 1 exclusive
1135    fn set_bloom_filter_fpp(&mut self, value: f64) {
1136        assert!(
1137            value > 0. && value < 1.0,
1138            "fpp must be between 0 and 1 exclusive, got {value}"
1139        );
1140
1141        self.bloom_filter_properties
1142            .get_or_insert_with(Default::default)
1143            .fpp = value;
1144    }
1145
1146    /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
1147    /// enables bloom filter if not previously enabled.
1148    fn set_bloom_filter_ndv(&mut self, value: u64) {
1149        self.bloom_filter_properties
1150            .get_or_insert_with(Default::default)
1151            .ndv = value;
1152    }
1153
1154    /// Returns optional encoding for this column.
1155    fn encoding(&self) -> Option<Encoding> {
1156        self.encoding
1157    }
1158
1159    /// Returns optional compression codec for this column.
1160    fn compression(&self) -> Option<Compression> {
1161        self.codec
1162    }
1163
1164    /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1165    /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1166    /// been provided.
1167    fn dictionary_enabled(&self) -> Option<bool> {
1168        self.dictionary_enabled
1169    }
1170
1171    /// Returns optional dictionary page size limit for this column.
1172    fn dictionary_page_size_limit(&self) -> Option<usize> {
1173        self.dictionary_page_size_limit
1174    }
1175
1176    /// Returns optional statistics level requested for this column. If result is `None`,
1177    /// then no setting has been provided.
1178    fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1179        self.statistics_enabled
1180    }
1181
1182    /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
1183    /// column.
1184    ///
1185    /// [`Statistics`]: crate::file::statistics::Statistics
1186    fn write_page_header_statistics(&self) -> Option<bool> {
1187        self.write_page_header_statistics
1188    }
1189
1190    /// Returns the bloom filter properties, or `None` if not enabled
1191    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1192        self.bloom_filter_properties.as_ref()
1193    }
1194}
1195
/// Reference counted reader properties: an [`Arc`] around [`ReaderProperties`].
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;
1198
/// Whether bloom filters are read when no explicit choice was made on the builder.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
/// Whether page-level statistics are read when no explicit choice was made on the builder.
const DEFAULT_READ_PAGE_STATS: bool = false;
1201
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    /// Codec options assembled by the builder.
    codec_options: CodecOptions,
    /// Whether to read bloom filter.
    read_bloom_filter: bool,
    /// Whether to read page level statistics.
    read_page_stats: bool,
}
1225
impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    ///
    /// Call [`ReaderPropertiesBuilder::build`] on the result to obtain the
    /// finished [`ReaderProperties`].
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether to read bloom filter
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }

    /// Returns whether to read page level statistics
    pub(crate) fn read_page_stats(&self) -> bool {
        self.read_page_stats
    }
}
1247
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    /// Builder for the codec options of the finished properties.
    codec_options_builder: CodecOptionsBuilder,
    /// Explicit choice for reading bloom filters; `None` until a value is set.
    read_bloom_filter: Option<bool>,
    /// Explicit choice for reading page-level statistics; `None` until a value is set.
    read_page_stats: Option<bool>,
}
1255
1256/// Reader properties builder.
1257impl ReaderPropertiesBuilder {
1258    /// Returns default state of the builder.
1259    fn with_defaults() -> Self {
1260        Self {
1261            codec_options_builder: CodecOptionsBuilder::default(),
1262            read_bloom_filter: None,
1263            read_page_stats: None,
1264        }
1265    }
1266
1267    /// Finalizes the configuration and returns immutable reader properties struct.
1268    pub fn build(self) -> ReaderProperties {
1269        ReaderProperties {
1270            codec_options: self.codec_options_builder.build(),
1271            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1272            read_page_stats: self.read_page_stats.unwrap_or(DEFAULT_READ_PAGE_STATS),
1273        }
1274    }
1275
1276    /// Enable/disable backward compatible LZ4.
1277    ///
1278    /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1279    /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1280    /// with files generated by older versions of this library, and LZ4_RAW, for backward
1281    /// compatibility with files generated by older versions of parquet-cpp.
1282    ///
1283    /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1284    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1285        self.codec_options_builder = self
1286            .codec_options_builder
1287            .set_backward_compatible_lz4(value);
1288        self
1289    }
1290
1291    /// Enable/disable reading bloom filter
1292    ///
1293    /// If reading bloom filter is enabled, bloom filter will be read from the file.
1294    /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1295    ///
1296    /// By default bloom filter is set to be read.
1297    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1298        self.read_bloom_filter = Some(value);
1299        self
1300    }
1301
1302    /// Enable/disable reading page-level statistics
1303    ///
1304    /// If set to `true`, then the reader will decode and populate the [`Statistics`] for
1305    /// each page, if present.
1306    /// If set to `false`, then the reader will skip decoding the statistics.
1307    ///
1308    /// By default statistics will not be decoded.
1309    ///
1310    /// [`Statistics`]: crate::file::statistics::Statistics
1311    pub fn set_read_page_statistics(mut self, value: bool) -> Self {
1312        self.read_page_stats = Some(value);
1313        self
1314    }
1315}
1316
1317#[cfg(test)]
1318mod tests {
1319    use super::*;
1320
1321    #[test]
1322    fn test_writer_version() {
1323        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
1324        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
1325    }
1326
1327    #[test]
1328    fn test_writer_properties_default_settings() {
1329        let props = WriterProperties::default();
1330        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
1331        assert_eq!(
1332            props.dictionary_page_size_limit(),
1333            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
1334        );
1335        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
1336        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
1337        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
1338        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
1339        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
1340        assert_eq!(props.key_value_metadata(), None);
1341        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
1342        assert_eq!(
1343            props.compression(&ColumnPath::from("col")),
1344            DEFAULT_COMPRESSION
1345        );
1346        assert_eq!(
1347            props.dictionary_enabled(&ColumnPath::from("col")),
1348            DEFAULT_DICTIONARY_ENABLED
1349        );
1350        assert_eq!(
1351            props.statistics_enabled(&ColumnPath::from("col")),
1352            DEFAULT_STATISTICS_ENABLED
1353        );
1354        assert!(
1355            props
1356                .bloom_filter_properties(&ColumnPath::from("col"))
1357                .is_none()
1358        );
1359    }
1360
1361    #[test]
1362    fn test_writer_properties_dictionary_encoding() {
1363        // dictionary encoding is not configurable, and it should be the same for both
1364        // writer version 1 and 2.
1365        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
1366            let props = WriterProperties::builder()
1367                .set_writer_version(*version)
1368                .build();
1369            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
1370            assert_eq!(
1371                props.dictionary_data_page_encoding(),
1372                Encoding::RLE_DICTIONARY
1373            );
1374        }
1375    }
1376
1377    #[test]
1378    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1379    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
1380        // Should panic when user specifies dictionary encoding as fallback encoding.
1381        WriterProperties::builder()
1382            .set_encoding(Encoding::PLAIN_DICTIONARY)
1383            .build();
1384    }
1385
1386    #[test]
1387    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1388    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
1389        // Should panic when user specifies dictionary encoding as fallback encoding.
1390        WriterProperties::builder()
1391            .set_encoding(Encoding::RLE_DICTIONARY)
1392            .build();
1393    }
1394
1395    #[test]
1396    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1397    fn test_writer_properties_panic_when_dictionary_is_enabled() {
1398        WriterProperties::builder()
1399            .set_dictionary_enabled(true)
1400            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
1401            .build();
1402    }
1403
1404    #[test]
1405    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1406    fn test_writer_properties_panic_when_dictionary_is_disabled() {
1407        WriterProperties::builder()
1408            .set_dictionary_enabled(false)
1409            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
1410            .build();
1411    }
1412
1413    #[test]
1414    fn test_writer_properties_builder() {
1415        let props = WriterProperties::builder()
1416            // file settings
1417            .set_writer_version(WriterVersion::PARQUET_2_0)
1418            .set_data_page_size_limit(10)
1419            .set_dictionary_page_size_limit(20)
1420            .set_write_batch_size(30)
1421            .set_max_row_group_size(40)
1422            .set_created_by("default".to_owned())
1423            .set_key_value_metadata(Some(vec![KeyValue::new(
1424                "key".to_string(),
1425                "value".to_string(),
1426            )]))
1427            // global column settings
1428            .set_encoding(Encoding::DELTA_BINARY_PACKED)
1429            .set_compression(Compression::GZIP(Default::default()))
1430            .set_dictionary_enabled(false)
1431            .set_statistics_enabled(EnabledStatistics::None)
1432            // specific column settings
1433            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
1434            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
1435            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
1436            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
1437            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
1438            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
1439            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
1440            .build();
1441
1442        fn test_props(props: &WriterProperties) {
1443            assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
1444            assert_eq!(props.data_page_size_limit(), 10);
1445            assert_eq!(props.dictionary_page_size_limit(), 20);
1446            assert_eq!(props.write_batch_size(), 30);
1447            assert_eq!(props.max_row_group_size(), 40);
1448            assert_eq!(props.created_by(), "default");
1449            assert_eq!(
1450                props.key_value_metadata(),
1451                Some(&vec![
1452                    KeyValue::new("key".to_string(), "value".to_string(),)
1453                ])
1454            );
1455
1456            assert_eq!(
1457                props.encoding(&ColumnPath::from("a")),
1458                Some(Encoding::DELTA_BINARY_PACKED)
1459            );
1460            assert_eq!(
1461                props.compression(&ColumnPath::from("a")),
1462                Compression::GZIP(Default::default())
1463            );
1464            assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
1465            assert_eq!(
1466                props.statistics_enabled(&ColumnPath::from("a")),
1467                EnabledStatistics::None
1468            );
1469
1470            assert_eq!(
1471                props.encoding(&ColumnPath::from("col")),
1472                Some(Encoding::RLE)
1473            );
1474            assert_eq!(
1475                props.compression(&ColumnPath::from("col")),
1476                Compression::SNAPPY
1477            );
1478            assert!(props.dictionary_enabled(&ColumnPath::from("col")));
1479            assert_eq!(
1480                props.statistics_enabled(&ColumnPath::from("col")),
1481                EnabledStatistics::Chunk
1482            );
1483            assert_eq!(
1484                props.bloom_filter_properties(&ColumnPath::from("col")),
1485                Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
1486            );
1487        }
1488
1489        // Test direct build of properties
1490        test_props(&props);
1491
1492        // Test that into_builder() gives the same result
1493        let props_into_builder_and_back = props.into_builder().build();
1494        test_props(&props_into_builder_and_back);
1495    }
1496
1497    #[test]
1498    fn test_writer_properties_builder_partial_defaults() {
1499        let props = WriterProperties::builder()
1500            .set_encoding(Encoding::DELTA_BINARY_PACKED)
1501            .set_compression(Compression::GZIP(Default::default()))
1502            .set_bloom_filter_enabled(true)
1503            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
1504            .build();
1505
1506        assert_eq!(
1507            props.encoding(&ColumnPath::from("col")),
1508            Some(Encoding::RLE)
1509        );
1510        assert_eq!(
1511            props.compression(&ColumnPath::from("col")),
1512            Compression::GZIP(Default::default())
1513        );
1514        assert_eq!(
1515            props.dictionary_enabled(&ColumnPath::from("col")),
1516            DEFAULT_DICTIONARY_ENABLED
1517        );
1518        assert_eq!(
1519            props.bloom_filter_properties(&ColumnPath::from("col")),
1520            Some(&BloomFilterProperties {
1521                fpp: 0.05,
1522                ndv: 1_000_000_u64
1523            })
1524        );
1525    }
1526
1527    #[test]
1528    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
1529        assert_eq!(
1530            WriterProperties::builder()
1531                .build()
1532                .bloom_filter_properties(&ColumnPath::from("col")),
1533            None
1534        );
1535        assert_eq!(
1536            WriterProperties::builder()
1537                .set_bloom_filter_ndv(100)
1538                .build()
1539                .bloom_filter_properties(&ColumnPath::from("col")),
1540            Some(&BloomFilterProperties {
1541                fpp: 0.05,
1542                ndv: 100
1543            })
1544        );
1545        assert_eq!(
1546            WriterProperties::builder()
1547                .set_bloom_filter_fpp(0.1)
1548                .build()
1549                .bloom_filter_properties(&ColumnPath::from("col")),
1550            Some(&BloomFilterProperties {
1551                fpp: 0.1,
1552                ndv: 1_000_000_u64
1553            })
1554        );
1555    }
1556
1557    #[test]
1558    fn test_writer_properties_column_dictionary_page_size_limit() {
1559        let props = WriterProperties::builder()
1560            .set_dictionary_page_size_limit(100)
1561            .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
1562            .build();
1563
1564        assert_eq!(props.dictionary_page_size_limit(), 100);
1565        assert_eq!(
1566            props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
1567            10
1568        );
1569        assert_eq!(
1570            props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
1571            100
1572        );
1573    }
1574
1575    #[test]
1576    fn test_reader_properties_default_settings() {
1577        let props = ReaderProperties::builder().build();
1578
1579        let codec_options = CodecOptionsBuilder::default()
1580            .set_backward_compatible_lz4(true)
1581            .build();
1582
1583        assert_eq!(props.codec_options(), &codec_options);
1584        assert!(!props.read_bloom_filter());
1585    }
1586
1587    #[test]
1588    fn test_reader_properties_builder() {
1589        let props = ReaderProperties::builder()
1590            .set_backward_compatible_lz4(false)
1591            .build();
1592
1593        let codec_options = CodecOptionsBuilder::default()
1594            .set_backward_compatible_lz4(false)
1595            .build();
1596
1597        assert_eq!(props.codec_options(), &codec_options);
1598    }
1599
1600    #[test]
1601    fn test_parse_writerversion() {
1602        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
1603        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1604        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
1605        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);
1606
1607        // test lowercase
1608        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
1609        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1610
1611        // test invalid version
1612        match "PARQUET_-1_0".parse::<WriterVersion>() {
1613            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
1614            Err(e) => {
1615                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
1616            }
1617        }
1618    }
1619
1620    #[test]
1621    fn test_parse_enabledstatistics() {
1622        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
1623        assert_eq!(enabled_statistics, EnabledStatistics::None);
1624        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
1625        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
1626        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
1627        assert_eq!(enabled_statistics, EnabledStatistics::Page);
1628
1629        // test lowercase
1630        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
1631        assert_eq!(enabled_statistics, EnabledStatistics::None);
1632
1633        //test invalid statistics
1634        match "ChunkAndPage".parse::<EnabledStatistics>() {
1635            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
1636            Err(e) => {
1637                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
1638            }
1639        }
1640    }
1641}