// parquet/file/properties.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::{KeyValue, SortingColumn};
24use crate::schema::types::ColumnPath;
25use std::str::FromStr;
26use std::{collections::HashMap, sync::Arc};
27
28/// Default value for [`WriterProperties::data_page_size_limit`]
29pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
30/// Default value for [`WriterProperties::write_batch_size`]
31pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
32/// Default value for [`WriterProperties::writer_version`]
33pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
34/// Default value for [`WriterProperties::compression`]
35pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
36/// Default value for [`WriterProperties::dictionary_enabled`]
37pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
38/// Default value for [`WriterProperties::dictionary_page_size_limit`]
39pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
40/// Default value for [`WriterProperties::data_page_row_count_limit`]
41pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
42/// Default value for [`WriterProperties::statistics_enabled`]
43pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
44/// Default value for [`WriterProperties::write_page_header_statistics`]
45pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
46/// Default value for [`WriterProperties::max_row_group_row_count`]
47pub const DEFAULT_MAX_ROW_GROUP_ROW_COUNT: usize = 1024 * 1024;
48/// Default value for [`WriterProperties::bloom_filter_position`]
49pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
50/// Default value for [`WriterProperties::created_by`]
51pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
52/// Default value for [`WriterProperties::column_index_truncate_length`]
53pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
54/// Default value for [`BloomFilterProperties::fpp`]
55pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
56/// Default value for [`BloomFilterProperties::ndv`]
57pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
58/// Default values for [`WriterProperties::statistics_truncate_length`]
59pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
60/// Default value for [`WriterProperties::offset_index_disabled`]
61pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
62/// Default values for [`WriterProperties::coerce_types`]
63pub const DEFAULT_COERCE_TYPES: bool = false;
64
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// The variant names deliberately mirror the upper-case spelling that
// `FromStr` accepts, so they do not follow Rust's camel-case convention.
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns writer version as `i32` (`1` for [`Self::PARQUET_1_0`],
    /// `2` for [`Self::PARQUET_2_0`]).
    pub fn as_num(&self) -> i32 {
        match self {
            WriterVersion::PARQUET_1_0 => 1,
            WriterVersion::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a writer version from its variant name.
    ///
    /// The comparison ignores ASCII case, so `"PARQUET_1_0"`, `"parquet_1_0"`
    /// and `"Parquet_1_0"` all parse to [`WriterVersion::PARQUET_1_0`].
    ///
    /// # Errors
    ///
    /// Returns `Invalid writer version: <input>` for any other string.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s.eq_ignore_ascii_case("PARQUET_1_0") {
            Ok(WriterVersion::PARQUET_1_0)
        } else if s.eq_ignore_ascii_case("PARQUET_2_0") {
            Ok(WriterVersion::PARQUET_2_0)
        } else {
            Err(format!("Invalid writer version: {s}"))
        }
    }
}
98
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write the Bloom filters of each row group right after that row group
    ///
    /// This saves memory by writing each filter as soon as it is computed, at
    /// the cost of data locality for readers
    AfterRowGroup,
    /// Write all Bloom filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
116
117/// Reference counted writer properties.
118pub type WriterPropertiesPtr = Arc<WriterProperties>;
119
120/// Configuration settings for writing parquet files.
121///
122/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
123///
124/// # Example
125///
126/// ```rust
127/// # use parquet::{
128/// #    basic::{Compression, Encoding},
129/// #    file::properties::*,
130/// #    schema::types::ColumnPath,
131/// # };
132/// #
133/// // Create properties with default configuration.
134/// let props = WriterProperties::default();
135///
136/// // Use properties builder to set certain options and assemble the configuration.
137/// let props = WriterProperties::builder()
138///     .set_writer_version(WriterVersion::PARQUET_1_0)
139///     .set_encoding(Encoding::PLAIN)
140///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
141///     .set_compression(Compression::SNAPPY)
142///     .build();
143///
144/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
145/// assert_eq!(
146///     props.encoding(&ColumnPath::from("col1")),
147///     Some(Encoding::DELTA_BINARY_PACKED)
148/// );
149/// assert_eq!(
150///     props.encoding(&ColumnPath::from("col2")),
151///     Some(Encoding::PLAIN)
152/// );
153/// ```
154#[derive(Debug, Clone)]
155pub struct WriterProperties {
156    data_page_row_count_limit: usize,
157    write_batch_size: usize,
158    max_row_group_row_count: Option<usize>,
159    max_row_group_bytes: Option<usize>,
160    bloom_filter_position: BloomFilterPosition,
161    writer_version: WriterVersion,
162    created_by: String,
163    offset_index_disabled: bool,
164    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
165    default_column_properties: ColumnProperties,
166    column_properties: HashMap<ColumnPath, ColumnProperties>,
167    sorting_columns: Option<Vec<SortingColumn>>,
168    column_index_truncate_length: Option<usize>,
169    statistics_truncate_length: Option<usize>,
170    coerce_types: bool,
171    #[cfg(feature = "encryption")]
172    pub(crate) file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
173}
174
175impl Default for WriterProperties {
176    fn default() -> Self {
177        Self::builder().build()
178    }
179}
180
181impl WriterProperties {
182    /// Create a new [`WriterProperties`] with the default settings
183    ///
184    /// See [`WriterProperties::builder`] for customising settings
185    pub fn new() -> Self {
186        Self::default()
187    }
188
189    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
190    /// properties.
191    pub fn builder() -> WriterPropertiesBuilder {
192        WriterPropertiesBuilder::default()
193    }
194
195    /// Converts this [`WriterProperties`] into a [`WriterPropertiesBuilder`]
196    /// Used for mutating existing property settings
197    pub fn into_builder(self) -> WriterPropertiesBuilder {
198        self.into()
199    }
200
201    /// Returns data page size limit.
202    ///
203    /// Note: this is a best effort limit based on the write batch size
204    ///
205    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
206    pub fn data_page_size_limit(&self) -> usize {
207        self.default_column_properties
208            .data_page_size_limit()
209            .unwrap_or(DEFAULT_PAGE_SIZE)
210    }
211
212    /// Returns data page size limit for a specific column.
213    ///
214    /// Takes precedence over [`Self::data_page_size_limit`].
215    ///
216    /// Note: this is a best effort limit based on the write batch size.
217    pub fn column_data_page_size_limit(&self, col: &ColumnPath) -> usize {
218        self.column_properties
219            .get(col)
220            .and_then(|c| c.data_page_size_limit())
221            .or_else(|| self.default_column_properties.data_page_size_limit())
222            .unwrap_or(DEFAULT_PAGE_SIZE)
223    }
224
225    /// Returns dictionary page size limit.
226    ///
227    /// Note: this is a best effort limit based on the write batch size
228    ///
229    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
230    pub fn dictionary_page_size_limit(&self) -> usize {
231        self.default_column_properties
232            .dictionary_page_size_limit()
233            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
234    }
235
236    /// Returns dictionary page size limit for a specific column.
237    pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
238        self.column_properties
239            .get(col)
240            .and_then(|c| c.dictionary_page_size_limit())
241            .or_else(|| self.default_column_properties.dictionary_page_size_limit())
242            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
243    }
244
245    /// Returns the maximum page row count
246    ///
247    /// Note: this is a best effort limit based on the write batch size
248    ///
249    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
250    pub fn data_page_row_count_limit(&self) -> usize {
251        self.data_page_row_count_limit
252    }
253
254    /// Returns configured batch size for writes.
255    ///
256    /// When writing a batch of data, this setting allows to split it internally into
257    /// smaller batches so we can better estimate the size of a page currently being
258    /// written.
259    ///
260    /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
261    pub fn write_batch_size(&self) -> usize {
262        self.write_batch_size
263    }
264
265    /// Returns maximum number of rows in a row group, or `usize::MAX` if unlimited.
266    ///
267    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
268    #[deprecated(since = "58.0.0", note = "Use `max_row_group_row_count` instead")]
269    pub fn max_row_group_size(&self) -> usize {
270        self.max_row_group_row_count.unwrap_or(usize::MAX)
271    }
272
273    /// Returns maximum number of rows in a row group, or `None` if unlimited.
274    ///
275    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_row_count`]
276    pub fn max_row_group_row_count(&self) -> Option<usize> {
277        self.max_row_group_row_count
278    }
279
280    /// Returns maximum size of a row group in bytes, or `None` if unlimited.
281    ///
282    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_bytes`]
283    pub fn max_row_group_bytes(&self) -> Option<usize> {
284        self.max_row_group_bytes
285    }
286
287    /// Returns bloom filter position.
288    ///
289    /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
290    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
291        self.bloom_filter_position
292    }
293
294    /// Returns configured writer version.
295    ///
296    /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
297    pub fn writer_version(&self) -> WriterVersion {
298        self.writer_version
299    }
300
301    /// Returns `created_by` string.
302    ///
303    /// For more details see [`WriterPropertiesBuilder::set_created_by`]
304    pub fn created_by(&self) -> &str {
305        &self.created_by
306    }
307
308    /// Returns `true` if offset index writing is disabled.
309    ///
310    /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
311    pub fn offset_index_disabled(&self) -> bool {
312        // If page statistics are to be collected, then do not disable the offset indexes.
313        let default_page_stats_enabled =
314            self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
315        let column_page_stats_enabled = self
316            .column_properties
317            .iter()
318            .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
319        if default_page_stats_enabled || column_page_stats_enabled {
320            return false;
321        }
322
323        self.offset_index_disabled
324    }
325
326    /// Returns `key_value_metadata` KeyValue pairs.
327    ///
328    /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
329    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
330        self.key_value_metadata.as_ref()
331    }
332
333    /// Returns sorting columns.
334    ///
335    /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
336    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
337        self.sorting_columns.as_ref()
338    }
339
340    /// Returns the maximum length of truncated min/max values in the column index.
341    ///
342    /// `None` if truncation is disabled, must be greater than 0 otherwise.
343    ///
344    /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
345    pub fn column_index_truncate_length(&self) -> Option<usize> {
346        self.column_index_truncate_length
347    }
348
349    /// Returns the maximum length of truncated min/max values in [`Statistics`].
350    ///
351    /// `None` if truncation is disabled, must be greater than 0 otherwise.
352    ///
353    /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
354    ///
355    /// [`Statistics`]: crate::file::statistics::Statistics
356    pub fn statistics_truncate_length(&self) -> Option<usize> {
357        self.statistics_truncate_length
358    }
359
360    /// Returns `true` if type coercion is enabled.
361    ///
362    /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
363    pub fn coerce_types(&self) -> bool {
364        self.coerce_types
365    }
366
367    /// Returns encoding for a data page, when dictionary encoding is enabled.
368    ///
369    /// This is not configurable.
370    #[inline]
371    pub fn dictionary_data_page_encoding(&self) -> Encoding {
372        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
373        // Dictionary values are encoded using RLE_DICTIONARY encoding.
374        Encoding::RLE_DICTIONARY
375    }
376
377    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
378    ///
379    /// This is not configurable.
380    #[inline]
381    pub fn dictionary_page_encoding(&self) -> Encoding {
382        // PLAIN_DICTIONARY is deprecated in writer version 1.
383        // Dictionary is encoded using plain encoding.
384        Encoding::PLAIN
385    }
386
387    /// Returns encoding for a column, if set.
388    ///
389    /// In case when dictionary is enabled, returns fallback encoding.
390    ///
391    /// If encoding is not set, then column writer will choose the best encoding
392    /// based on the column type.
393    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
394        self.column_properties
395            .get(col)
396            .and_then(|c| c.encoding())
397            .or_else(|| self.default_column_properties.encoding())
398    }
399
400    /// Returns compression codec for a column.
401    ///
402    /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
403    pub fn compression(&self, col: &ColumnPath) -> Compression {
404        self.column_properties
405            .get(col)
406            .and_then(|c| c.compression())
407            .or_else(|| self.default_column_properties.compression())
408            .unwrap_or(DEFAULT_COMPRESSION)
409    }
410
411    /// Returns `true` if dictionary encoding is enabled for a column.
412    ///
413    /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
414    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
415        self.column_properties
416            .get(col)
417            .and_then(|c| c.dictionary_enabled())
418            .or_else(|| self.default_column_properties.dictionary_enabled())
419            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
420    }
421
422    /// Returns which statistics are written for a column.
423    ///
424    /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
425    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
426        self.column_properties
427            .get(col)
428            .and_then(|c| c.statistics_enabled())
429            .or_else(|| self.default_column_properties.statistics_enabled())
430            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
431    }
432
433    /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
434    ///
435    /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
436    ///
437    /// [`Statistics`]: crate::file::statistics::Statistics
438    pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
439        self.column_properties
440            .get(col)
441            .and_then(|c| c.write_page_header_statistics())
442            .or_else(|| {
443                self.default_column_properties
444                    .write_page_header_statistics()
445            })
446            .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
447    }
448
449    /// Returns the [`BloomFilterProperties`] for the given column
450    ///
451    /// Returns `None` if bloom filter is disabled
452    ///
453    /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
454    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
455        self.column_properties
456            .get(col)
457            .and_then(|c| c.bloom_filter_properties())
458            .or_else(|| self.default_column_properties.bloom_filter_properties())
459    }
460
461    /// Return file encryption properties
462    ///
463    /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
464    #[cfg(feature = "encryption")]
465    pub fn file_encryption_properties(&self) -> Option<&Arc<FileEncryptionProperties>> {
466        self.file_encryption_properties.as_ref()
467    }
468}
469
470/// Builder for  [`WriterProperties`] Parquet writer configuration.
471///
472/// See example on [`WriterProperties`]
473#[derive(Debug, Clone)]
474pub struct WriterPropertiesBuilder {
475    data_page_row_count_limit: usize,
476    write_batch_size: usize,
477    max_row_group_row_count: Option<usize>,
478    max_row_group_bytes: Option<usize>,
479    bloom_filter_position: BloomFilterPosition,
480    writer_version: WriterVersion,
481    created_by: String,
482    offset_index_disabled: bool,
483    key_value_metadata: Option<Vec<KeyValue>>,
484    default_column_properties: ColumnProperties,
485    column_properties: HashMap<ColumnPath, ColumnProperties>,
486    sorting_columns: Option<Vec<SortingColumn>>,
487    column_index_truncate_length: Option<usize>,
488    statistics_truncate_length: Option<usize>,
489    coerce_types: bool,
490    #[cfg(feature = "encryption")]
491    file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
492}
493
494impl Default for WriterPropertiesBuilder {
495    /// Returns default state of the builder.
496    fn default() -> Self {
497        Self {
498            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
499            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
500            max_row_group_row_count: Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT),
501            max_row_group_bytes: None,
502            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
503            writer_version: DEFAULT_WRITER_VERSION,
504            created_by: DEFAULT_CREATED_BY.to_string(),
505            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
506            key_value_metadata: None,
507            default_column_properties: Default::default(),
508            column_properties: HashMap::new(),
509            sorting_columns: None,
510            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
511            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
512            coerce_types: DEFAULT_COERCE_TYPES,
513            #[cfg(feature = "encryption")]
514            file_encryption_properties: None,
515        }
516    }
517}
518
519impl WriterPropertiesBuilder {
520    /// Finalizes the configuration and returns immutable writer properties struct.
521    pub fn build(self) -> WriterProperties {
522        WriterProperties {
523            data_page_row_count_limit: self.data_page_row_count_limit,
524            write_batch_size: self.write_batch_size,
525            max_row_group_row_count: self.max_row_group_row_count,
526            max_row_group_bytes: self.max_row_group_bytes,
527            bloom_filter_position: self.bloom_filter_position,
528            writer_version: self.writer_version,
529            created_by: self.created_by,
530            offset_index_disabled: self.offset_index_disabled,
531            key_value_metadata: self.key_value_metadata,
532            default_column_properties: self.default_column_properties,
533            column_properties: self.column_properties,
534            sorting_columns: self.sorting_columns,
535            column_index_truncate_length: self.column_index_truncate_length,
536            statistics_truncate_length: self.statistics_truncate_length,
537            coerce_types: self.coerce_types,
538            #[cfg(feature = "encryption")]
539            file_encryption_properties: self.file_encryption_properties,
540        }
541    }
542
543    // ----------------------------------------------------------------------
544    // Writer properties related to a file
545
546    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
547    /// via [`DEFAULT_WRITER_VERSION`])
548    ///
549    /// This value can determine what features some readers will support.
550    ///
551    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
552    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
553        self.writer_version = value;
554        self
555    }
556
557    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
558    /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
559    ///
560    /// The parquet writer will attempt to limit the number of rows in
561    /// each `DataPage` to this value. Reducing this value will result
562    /// in larger parquet files, but may improve the effectiveness of
563    /// page index based predicate pushdown during reading.
564    ///
565    /// Note: this is a best effort limit based on value of
566    /// [`set_write_batch_size`](Self::set_write_batch_size).
567    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
568        self.data_page_row_count_limit = value;
569        self
570    }
571
572    /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
573    ///
574    /// For performance reasons, data for each column is written in
575    /// batches of this size.
576    ///
577    /// Additional limits such as such as
578    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
579    /// are checked between batches, and thus the write batch size value acts as an
580    /// upper-bound on the enforcement granularity of other limits.
581    pub fn set_write_batch_size(mut self, value: usize) -> Self {
582        self.write_batch_size = value;
583        self
584    }
585
586    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
587    /// via [`DEFAULT_MAX_ROW_GROUP_ROW_COUNT`]).
588    ///
589    /// # Panics
590    /// If the value is set to 0.
591    #[deprecated(since = "58.0.0", note = "Use `set_max_row_group_row_count` instead")]
592    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
593        assert!(value > 0, "Cannot have a 0 max row group size");
594        self.max_row_group_row_count = Some(value);
595        self
596    }
597
598    /// Sets maximum number of rows in a row group, or `None` for unlimited.
599    ///
600    /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
601    /// the row group with the smaller limit will be produced.
602    ///
603    /// # Panics
604    /// If the value is `Some(0)`.
605    pub fn set_max_row_group_row_count(mut self, value: Option<usize>) -> Self {
606        assert_ne!(value, Some(0), "Cannot have a 0 max row group row count");
607        self.max_row_group_row_count = value;
608        self
609    }
610
611    /// Sets maximum size of a row group in bytes, or `None` for unlimited.
612    ///
613    /// Row groups are flushed when their estimated encoded size exceeds this threshold.
614    /// This is similar to the official Java implementation for `parquet.block.size`'s behavior.
615    ///
616    /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
617    /// the row group with the smaller limit will be produced.
618    ///
619    /// # Panics
620    /// If the value is `Some(0)`.
621    pub fn set_max_row_group_bytes(mut self, value: Option<usize>) -> Self {
622        assert_ne!(value, Some(0), "Cannot have a 0 max row group bytes");
623        self.max_row_group_bytes = value;
624        self
625    }
626
627    /// Sets where in the final file Bloom Filters are written (defaults to  [`AfterRowGroup`]
628    /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
629    ///
630    /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
631    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
632        self.bloom_filter_position = value;
633        self
634    }
635
636    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
637    /// [`DEFAULT_CREATED_BY`]).
638    ///
639    /// This is a string that will be written into the file metadata
640    pub fn set_created_by(mut self, value: String) -> Self {
641        self.created_by = value;
642        self
643    }
644
645    /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
646    /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
647    ///
648    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
649    ///
650    /// Note: As the offset indexes are useful for accessing data by row number,
651    /// they are always written by default, regardless of whether other statistics
652    /// are enabled. Disabling this metadata may result in a degradation in read
653    /// performance, so use this option with care.
654    ///
655    /// [`Page`]: EnabledStatistics::Page
656    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
657        self.offset_index_disabled = value;
658        self
659    }
660
661    /// Sets "key_value_metadata" property (defaults to `None`).
662    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
663        self.key_value_metadata = value;
664        self
665    }
666
667    /// Sets sorting order of rows in the row group if any (defaults to `None`).
668    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
669        self.sorting_columns = value;
670        self
671    }
672
673    /// Sets the max length of min/max value fields when writing the column
674    /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
675    ///
676    /// This can be used to prevent columns with very long values (hundreds of
677    /// bytes long) from causing the parquet metadata to become huge.
678    ///
679    /// # Notes
680    ///
681    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
682    /// set to [`EnabledStatistics::Page`].
683    ///
684    /// * If `Some`, must be greater than 0, otherwise will panic
685    /// * If `None`, there's no effective limit.
686    ///
687    /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
688    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
689        if let Some(value) = max_length {
690            assert!(
691                value > 0,
692                "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`."
693            );
694        }
695
696        self.column_index_truncate_length = max_length;
697        self
698    }
699
700    /// Sets the max length of min/max value fields in row group and data page header
701    /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
702    ///
703    /// # Notes
704    /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
705    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
706    /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
707    /// [`EnabledStatistics::Page`].
708    ///
709    /// * If `Some`, must be greater than 0, otherwise will panic
710    /// * If `None`, there's no effective limit.
711    ///
712    /// # See also
713    /// Truncation of Page Index statistics is controlled separately via
714    /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
715    ///
716    /// [`Statistics`]: crate::file::statistics::Statistics
717    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
718        if let Some(value) = max_length {
719            assert!(
720                value > 0,
721                "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`."
722            );
723        }
724
725        self.statistics_truncate_length = max_length;
726        self
727    }
728
729    /// Should the writer coerce types to parquet native types (defaults to `false` via
730    /// [`DEFAULT_COERCE_TYPES`]).
731    ///
732    /// Leaving this option the default `false` will ensure the exact same data
733    /// written to parquet using this library will be read.
734    ///
735    /// Setting this option to `true` will result in parquet files that can be
736    /// read by more readers, but potentially lose information in the process.
737    ///
738    /// * Types such as [`DataType::Date64`], which have no direct corresponding
739    ///   Parquet type, may be stored with lower precision.
740    ///
741    /// * The internal field names of `List` and `Map` types will be renamed if
742    ///   necessary to match what is required by the newest Parquet specification.
743    ///
744    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
745    ///
746    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
747    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
748    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
749        self.coerce_types = coerce_types;
750        self
751    }
752
753    /// Sets FileEncryptionProperties (defaults to `None`)
754    #[cfg(feature = "encryption")]
755    pub fn with_file_encryption_properties(
756        mut self,
757        file_encryption_properties: Arc<FileEncryptionProperties>,
758    ) -> Self {
759        self.file_encryption_properties = Some(file_encryption_properties);
760        self
761    }
762
763    // ----------------------------------------------------------------------
764    // Setters for any column (global)
765
766    /// Sets default encoding for all columns.
767    ///
768    /// If dictionary is not enabled, this is treated as a primary encoding for all
769    /// columns. In case when dictionary is enabled for any column, this value is
770    /// considered to be a fallback encoding for that column.
771    ///
772    /// # Panics
773    ///
774    /// if dictionary encoding is specified, regardless of dictionary
775    /// encoding flag being set.
776    pub fn set_encoding(mut self, value: Encoding) -> Self {
777        self.default_column_properties.set_encoding(value);
778        self
779    }
780
781    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
782    /// [`DEFAULT_COMPRESSION`]).
783    ///
784    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
785    pub fn set_compression(mut self, value: Compression) -> Self {
786        self.default_column_properties.set_compression(value);
787        self
788    }
789
790    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
791    /// via [`DEFAULT_DICTIONARY_ENABLED`]).
792    ///
793    /// Use this method to set dictionary encoding, instead of explicitly specifying
794    /// encoding in `set_encoding` method.
795    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
796        self.default_column_properties.set_dictionary_enabled(value);
797        self
798    }
799
800    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
801    /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
802    ///
803    /// The parquet writer will attempt to limit the size of each
804    /// `DataPage` used to store dictionaries to this many
805    /// bytes. Reducing this value will result in larger parquet
806    /// files, but may improve the effectiveness of page index based
807    /// predicate pushdown during reading.
808    ///
809    /// Note: this is a best effort limit based on value of
810    /// [`set_write_batch_size`](Self::set_write_batch_size).
811    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
812        self.default_column_properties
813            .set_dictionary_page_size_limit(value);
814        self
815    }
816
817    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
818    /// via [`DEFAULT_PAGE_SIZE`]).
819    ///
820    /// The parquet writer will attempt to limit the sizes of each
821    /// `DataPage` to this many bytes. Reducing this value will result
822    /// in larger parquet files, but may improve the effectiveness of
823    /// page index based predicate pushdown during reading.
824    ///
825    /// Note: this is a best effort limit based on value of
826    /// [`set_write_batch_size`](Self::set_write_batch_size).
827    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
828        self.default_column_properties
829            .set_data_page_size_limit(value);
830        self
831    }
832
833    /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
834    /// [`DEFAULT_STATISTICS_ENABLED`]).
835    ///
836    /// [`Page`]: EnabledStatistics::Page
837    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
838        self.default_column_properties.set_statistics_enabled(value);
839        self
840    }
841
842    /// enable/disable writing [`Statistics`] in the page header
843    /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
844    ///
845    /// Only applicable if [`Page`] level statistics are gathered.
846    ///
847    /// Setting this value to `true` can greatly increase the size of the resulting Parquet
848    /// file while yielding very little added benefit. Most modern Parquet implementations
849    /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
850    /// those in the page header.
851    ///
852    /// # Note
853    ///
854    /// Prior to version 56.0.0, the `parquet` crate always wrote these
855    /// statistics (the equivalent of setting this option to `true`). This was
856    /// changed in 56.0.0 to follow the recommendation in the Parquet
857    /// specification. See [issue #7580] for more details.
858    ///
859    /// [`Statistics`]: crate::file::statistics::Statistics
860    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
861    /// [`Page`]: EnabledStatistics::Page
862    /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
863    pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
864        self.default_column_properties
865            .set_write_page_header_statistics(value);
866        self
867    }
868
869    /// Sets if bloom filter should be written for all columns (defaults to `false`).
870    ///
871    /// # Notes
872    ///
873    /// * If the bloom filter is enabled previously then it is a no-op.
874    ///
875    /// * If the bloom filter is not enabled, default values for ndv and fpp
876    ///   value are used used. See [`set_bloom_filter_ndv`] and
877    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
878    ///
879    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
880    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
881    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
882        self.default_column_properties
883            .set_bloom_filter_enabled(value);
884        self
885    }
886
887    /// Sets the default target bloom filter false positive probability (fpp)
888    /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
889    ///
890    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
891    /// been called.
892    ///
893    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
894    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
895        self.default_column_properties.set_bloom_filter_fpp(value);
896        self
897    }
898
899    /// Sets default number of distinct values (ndv) for bloom filter for all
900    /// columns (defaults to `1_000_000` via [`DEFAULT_BLOOM_FILTER_NDV`]).
901    ///
902    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
903    /// been called.
904    ///
905    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
906    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
907        self.default_column_properties.set_bloom_filter_ndv(value);
908        self
909    }
910
911    // ----------------------------------------------------------------------
912    // Setters for a specific column
913
914    /// Helper method to get existing or new mutable reference of column properties.
915    #[inline]
916    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
917        self.column_properties.entry(col).or_default()
918    }
919
920    /// Sets encoding for a specific column.
921    ///
922    /// Takes precedence over [`Self::set_encoding`].
923    ///
924    /// If dictionary is not enabled, this is treated as a primary encoding for this
925    /// column. In case when dictionary is enabled for this column, either through
926    /// global defaults or explicitly, this value is considered to be a fallback
927    /// encoding for this column.
928    ///
929    /// # Panics
930    /// If user tries to set dictionary encoding here, regardless of dictionary
931    /// encoding flag being set.
932    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
933        self.get_mut_props(col).set_encoding(value);
934        self
935    }
936
937    /// Sets compression codec for a specific column.
938    ///
939    /// Takes precedence over [`Self::set_compression`].
940    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
941        self.get_mut_props(col).set_compression(value);
942        self
943    }
944
945    /// Sets flag to enable/disable dictionary encoding for a specific column.
946    ///
947    /// Takes precedence over [`Self::set_dictionary_enabled`].
948    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
949        self.get_mut_props(col).set_dictionary_enabled(value);
950        self
951    }
952
953    /// Sets dictionary page size limit for a specific column.
954    ///
955    /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
956    pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
957        self.get_mut_props(col)
958            .set_dictionary_page_size_limit(value);
959        self
960    }
961
962    /// Sets data page size limit for a specific column.
963    ///
964    /// Takes precedence over [`Self::set_data_page_size_limit`].
965    pub fn set_column_data_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
966        self.get_mut_props(col).set_data_page_size_limit(value);
967        self
968    }
969
970    /// Sets [`EnabledStatistics`] level for a specific column.
971    ///
972    /// Takes precedence over [`Self::set_statistics_enabled`].
973    pub fn set_column_statistics_enabled(
974        mut self,
975        col: ColumnPath,
976        value: EnabledStatistics,
977    ) -> Self {
978        self.get_mut_props(col).set_statistics_enabled(value);
979        self
980    }
981
982    /// Sets whether to write [`Statistics`] in the page header for a specific column.
983    ///
984    /// Takes precedence over [`Self::set_write_page_header_statistics`].
985    ///
986    /// [`Statistics`]: crate::file::statistics::Statistics
987    pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
988        self.get_mut_props(col)
989            .set_write_page_header_statistics(value);
990        self
991    }
992
993    /// Sets whether a bloom filter should be written for a specific column.
994    ///
995    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
996    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
997        self.get_mut_props(col).set_bloom_filter_enabled(value);
998        self
999    }
1000
1001    /// Sets the false positive probability for bloom filter for a specific column.
1002    ///
1003    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
1004    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
1005        self.get_mut_props(col).set_bloom_filter_fpp(value);
1006        self
1007    }
1008
1009    /// Sets the number of distinct values for bloom filter for a specific column.
1010    ///
1011    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
1012    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
1013        self.get_mut_props(col).set_bloom_filter_ndv(value);
1014        self
1015    }
1016}
1017
1018impl From<WriterProperties> for WriterPropertiesBuilder {
1019    fn from(props: WriterProperties) -> Self {
1020        WriterPropertiesBuilder {
1021            data_page_row_count_limit: props.data_page_row_count_limit,
1022            write_batch_size: props.write_batch_size,
1023            max_row_group_row_count: props.max_row_group_row_count,
1024            max_row_group_bytes: props.max_row_group_bytes,
1025            bloom_filter_position: props.bloom_filter_position,
1026            writer_version: props.writer_version,
1027            created_by: props.created_by,
1028            offset_index_disabled: props.offset_index_disabled,
1029            key_value_metadata: props.key_value_metadata,
1030            default_column_properties: props.default_column_properties,
1031            column_properties: props.column_properties,
1032            sorting_columns: props.sorting_columns,
1033            column_index_truncate_length: props.column_index_truncate_length,
1034            statistics_truncate_length: props.statistics_truncate_length,
1035            coerce_types: props.coerce_types,
1036            #[cfg(feature = "encryption")]
1037            file_encryption_properties: props.file_encryption_properties,
1038        }
1039    }
1040}
1041
/// Selects how much statistics the writer computes and stores in the parquet
/// file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// With this option one set of statistics is stored for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// With this option one set of statistics is stored for each relevant
    /// column for each row group. In addition, this will enable the writing
    /// of the column index (the offset index is always written regardless of
    /// this setting). See [`ParquetColumnIndex`] for
    /// more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}

impl FromStr for EnabledStatistics {
    type Err = String;

    /// Parses a statistics level from its all-uppercase or all-lowercase name
    /// (`NONE`/`none`, `CHUNK`/`chunk`, `PAGE`/`page`).
    ///
    /// Any other input, including mixed-case spellings, yields an error message
    /// that echoes the rejected string.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let level = match s {
            "NONE" | "none" => Self::None,
            "CHUNK" | "chunk" => Self::Chunk,
            "PAGE" | "page" => Self::Page,
            _ => return Err(format!("Invalid statistics arg: {s}")),
        };
        Ok(level)
    }
}
1085
impl Default for EnabledStatistics {
    /// Returns [`DEFAULT_STATISTICS_ENABLED`], keeping this impl in sync with the
    /// crate-wide default constant.
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
1091
/// Controls the bloom filter to be computed by the writer.
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability. This should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp: the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// Setting to a very small number diminishes the value of the filter itself, as the bitset size is
    /// even larger than just storing the whole value. You are also expected to set `ndv` if it can
    /// be known in advance to greatly reduce space usage.
    pub fpp: f64,
    /// Number of distinct values, should be non-negative to be meaningful (always true for a `u64`;
    /// presumably a value greater than zero is intended). Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Usage of bloom filter is most beneficial for columns with large cardinality, so a good heuristic
    /// is to set ndv to the number of rows. However, it can reduce disk size if you know in advance a smaller
    /// number of distinct values. For very small ndv value it is probably not worth it to use bloom filter
    /// anyway.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
    pub ndv: u64,
}
1119
1120impl Default for BloomFilterProperties {
1121    fn default() -> Self {
1122        BloomFilterProperties {
1123            fpp: DEFAULT_BLOOM_FILTER_FPP,
1124            ndv: DEFAULT_BLOOM_FILTER_NDV,
1125        }
1126    }
1127}
1128
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    /// Encoding requested for the column; the setter rejects dictionary encodings.
    encoding: Option<Encoding>,
    /// Compression codec requested for the column.
    codec: Option<Compression>,
    /// Data page size limit requested for the column.
    data_page_size_limit: Option<usize>,
    /// Dictionary page size limit requested for the column.
    dictionary_page_size_limit: Option<usize>,
    /// Whether dictionary encoding was explicitly enabled or disabled.
    dictionary_enabled: Option<bool>,
    /// Statistics level requested for the column.
    statistics_enabled: Option<EnabledStatistics>,
    /// Whether statistics were explicitly requested in (or excluded from) the page header.
    write_page_header_statistics: Option<bool>,
    /// bloom filter related properties; `None` means the bloom filter is not enabled
    bloom_filter_properties: Option<BloomFilterProperties>,
}
1145
1146impl ColumnProperties {
1147    /// Sets encoding for this column.
1148    ///
1149    /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1150    /// In case when dictionary is enabled for a column, this value is considered to
1151    /// be a fallback encoding.
1152    ///
1153    /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1154    /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1155    /// for a column.
1156    fn set_encoding(&mut self, value: Encoding) {
1157        if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1158            panic!("Dictionary encoding can not be used as fallback encoding");
1159        }
1160        self.encoding = Some(value);
1161    }
1162
1163    /// Sets compression codec for this column.
1164    fn set_compression(&mut self, value: Compression) {
1165        self.codec = Some(value);
1166    }
1167
1168    /// Sets data page size limit for this column.
1169    fn set_data_page_size_limit(&mut self, value: usize) {
1170        self.data_page_size_limit = Some(value);
1171    }
1172
1173    /// Sets whether dictionary encoding is enabled for this column.
1174    fn set_dictionary_enabled(&mut self, enabled: bool) {
1175        self.dictionary_enabled = Some(enabled);
1176    }
1177
1178    /// Sets dictionary page size limit for this column.
1179    fn set_dictionary_page_size_limit(&mut self, value: usize) {
1180        self.dictionary_page_size_limit = Some(value);
1181    }
1182
1183    /// Sets the statistics level for this column.
1184    fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1185        self.statistics_enabled = Some(enabled);
1186    }
1187
1188    /// Sets whether to write statistics in the page header for this column.
1189    fn set_write_page_header_statistics(&mut self, enabled: bool) {
1190        self.write_page_header_statistics = Some(enabled);
1191    }
1192
1193    /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1194    /// otherwise it is a no-op.
1195    /// If `value` is `false`, resets bloom filter properties to `None`.
1196    fn set_bloom_filter_enabled(&mut self, value: bool) {
1197        if value && self.bloom_filter_properties.is_none() {
1198            self.bloom_filter_properties = Some(Default::default())
1199        } else if !value {
1200            self.bloom_filter_properties = None
1201        }
1202    }
1203
1204    /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1205    /// bloom filter if not previously enabled.
1206    ///
1207    /// # Panics
1208    ///
1209    /// Panics if the `value` is not between 0 and 1 exclusive
1210    fn set_bloom_filter_fpp(&mut self, value: f64) {
1211        assert!(
1212            value > 0. && value < 1.0,
1213            "fpp must be between 0 and 1 exclusive, got {value}"
1214        );
1215
1216        self.bloom_filter_properties
1217            .get_or_insert_with(Default::default)
1218            .fpp = value;
1219    }
1220
1221    /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
1222    /// enables bloom filter if not previously enabled.
1223    fn set_bloom_filter_ndv(&mut self, value: u64) {
1224        self.bloom_filter_properties
1225            .get_or_insert_with(Default::default)
1226            .ndv = value;
1227    }
1228
1229    /// Returns optional encoding for this column.
1230    fn encoding(&self) -> Option<Encoding> {
1231        self.encoding
1232    }
1233
1234    /// Returns optional compression codec for this column.
1235    fn compression(&self) -> Option<Compression> {
1236        self.codec
1237    }
1238
1239    /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1240    /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1241    /// been provided.
1242    fn dictionary_enabled(&self) -> Option<bool> {
1243        self.dictionary_enabled
1244    }
1245
1246    /// Returns optional dictionary page size limit for this column.
1247    fn dictionary_page_size_limit(&self) -> Option<usize> {
1248        self.dictionary_page_size_limit
1249    }
1250
1251    /// Returns optional data page size limit for this column.
1252    fn data_page_size_limit(&self) -> Option<usize> {
1253        self.data_page_size_limit
1254    }
1255
1256    /// Returns optional statistics level requested for this column. If result is `None`,
1257    /// then no setting has been provided.
1258    fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1259        self.statistics_enabled
1260    }
1261
1262    /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
1263    /// column.
1264    ///
1265    /// [`Statistics`]: crate::file::statistics::Statistics
1266    fn write_page_header_statistics(&self) -> Option<bool> {
1267        self.write_page_header_statistics
1268    }
1269
1270    /// Returns the bloom filter properties, or `None` if not enabled
1271    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1272        self.bloom_filter_properties.as_ref()
1273    }
1274}
1275
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

/// Fallback used by [`ReaderPropertiesBuilder::build`] when
/// [`ReaderPropertiesBuilder::set_read_bloom_filter`] was never called.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
/// Fallback used by [`ReaderPropertiesBuilder::build`] when
/// [`ReaderPropertiesBuilder::set_read_page_statistics`] was never called.
const DEFAULT_READ_PAGE_STATS: bool = false;
1281
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    /// Codec options produced by the builder's `CodecOptionsBuilder`.
    codec_options: CodecOptions,
    /// Whether to read bloom filter.
    read_bloom_filter: bool,
    /// Whether to read page level statistics.
    read_page_stats: bool,
}
1305
impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether to read bloom filter
    ///
    /// This is `false` unless it was enabled through
    /// [`ReaderPropertiesBuilder::set_read_bloom_filter`].
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }

    /// Returns whether to read page level statistics
    ///
    /// This is `false` unless it was enabled through
    /// [`ReaderPropertiesBuilder::set_read_page_statistics`].
    pub(crate) fn read_page_stats(&self) -> bool {
        self.read_page_stats
    }
}
1327
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    /// Builder for the codec options; finalized in [`ReaderPropertiesBuilder::build`].
    codec_options_builder: CodecOptionsBuilder,
    /// Explicit bloom filter reading choice; `None` means "use the default".
    read_bloom_filter: Option<bool>,
    /// Explicit page statistics reading choice; `None` means "use the default".
    read_page_stats: Option<bool>,
}
1335
1336/// Reader properties builder.
1337impl ReaderPropertiesBuilder {
1338    /// Returns default state of the builder.
1339    fn with_defaults() -> Self {
1340        Self {
1341            codec_options_builder: CodecOptionsBuilder::default(),
1342            read_bloom_filter: None,
1343            read_page_stats: None,
1344        }
1345    }
1346
1347    /// Finalizes the configuration and returns immutable reader properties struct.
1348    pub fn build(self) -> ReaderProperties {
1349        ReaderProperties {
1350            codec_options: self.codec_options_builder.build(),
1351            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1352            read_page_stats: self.read_page_stats.unwrap_or(DEFAULT_READ_PAGE_STATS),
1353        }
1354    }
1355
1356    /// Enable/disable backward compatible LZ4.
1357    ///
1358    /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1359    /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1360    /// with files generated by older versions of this library, and LZ4_RAW, for backward
1361    /// compatibility with files generated by older versions of parquet-cpp.
1362    ///
1363    /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1364    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1365        self.codec_options_builder = self
1366            .codec_options_builder
1367            .set_backward_compatible_lz4(value);
1368        self
1369    }
1370
1371    /// Enable/disable reading bloom filter
1372    ///
1373    /// If reading bloom filter is enabled, bloom filter will be read from the file.
1374    /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1375    ///
1376    /// By default bloom filter is set to be read.
1377    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1378        self.read_bloom_filter = Some(value);
1379        self
1380    }
1381
1382    /// Enable/disable reading page-level statistics
1383    ///
1384    /// If set to `true`, then the reader will decode and populate the [`Statistics`] for
1385    /// each page, if present.
1386    /// If set to `false`, then the reader will skip decoding the statistics.
1387    ///
1388    /// By default statistics will not be decoded.
1389    ///
1390    /// [`Statistics`]: crate::file::statistics::Statistics
1391    pub fn set_read_page_statistics(mut self, value: bool) -> Self {
1392        self.read_page_stats = Some(value);
1393        self
1394    }
1395}
1396
1397#[cfg(test)]
1398mod tests {
1399    use super::*;
1400
1401    #[test]
1402    fn test_writer_version() {
1403        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
1404        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
1405    }
1406
1407    #[test]
1408    fn test_writer_properties_default_settings() {
1409        let props = WriterProperties::default();
1410        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
1411        assert_eq!(
1412            props.dictionary_page_size_limit(),
1413            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
1414        );
1415        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
1416        assert_eq!(
1417            props.max_row_group_row_count(),
1418            Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT)
1419        );
1420        assert_eq!(props.max_row_group_bytes(), None);
1421        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
1422        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
1423        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
1424        assert_eq!(props.key_value_metadata(), None);
1425        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
1426        assert_eq!(
1427            props.compression(&ColumnPath::from("col")),
1428            DEFAULT_COMPRESSION
1429        );
1430        assert_eq!(
1431            props.dictionary_enabled(&ColumnPath::from("col")),
1432            DEFAULT_DICTIONARY_ENABLED
1433        );
1434        assert_eq!(
1435            props.statistics_enabled(&ColumnPath::from("col")),
1436            DEFAULT_STATISTICS_ENABLED
1437        );
1438        assert!(
1439            props
1440                .bloom_filter_properties(&ColumnPath::from("col"))
1441                .is_none()
1442        );
1443    }
1444
1445    #[test]
1446    fn test_writer_properties_dictionary_encoding() {
1447        // dictionary encoding is not configurable, and it should be the same for both
1448        // writer version 1 and 2.
1449        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
1450            let props = WriterProperties::builder()
1451                .set_writer_version(*version)
1452                .build();
1453            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
1454            assert_eq!(
1455                props.dictionary_data_page_encoding(),
1456                Encoding::RLE_DICTIONARY
1457            );
1458        }
1459    }
1460
1461    #[test]
1462    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1463    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
1464        // Should panic when user specifies dictionary encoding as fallback encoding.
1465        WriterProperties::builder()
1466            .set_encoding(Encoding::PLAIN_DICTIONARY)
1467            .build();
1468    }
1469
1470    #[test]
1471    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1472    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
1473        // Should panic when user specifies dictionary encoding as fallback encoding.
1474        WriterProperties::builder()
1475            .set_encoding(Encoding::RLE_DICTIONARY)
1476            .build();
1477    }
1478
1479    #[test]
1480    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1481    fn test_writer_properties_panic_when_dictionary_is_enabled() {
1482        WriterProperties::builder()
1483            .set_dictionary_enabled(true)
1484            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
1485            .build();
1486    }
1487
1488    #[test]
1489    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1490    fn test_writer_properties_panic_when_dictionary_is_disabled() {
1491        WriterProperties::builder()
1492            .set_dictionary_enabled(false)
1493            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
1494            .build();
1495    }
1496
1497    #[test]
1498    fn test_writer_properties_builder() {
1499        let props = WriterProperties::builder()
1500            // file settings
1501            .set_writer_version(WriterVersion::PARQUET_2_0)
1502            .set_data_page_size_limit(10)
1503            .set_dictionary_page_size_limit(20)
1504            .set_write_batch_size(30)
1505            .set_max_row_group_row_count(Some(40))
1506            .set_created_by("default".to_owned())
1507            .set_key_value_metadata(Some(vec![KeyValue::new(
1508                "key".to_string(),
1509                "value".to_string(),
1510            )]))
1511            // global column settings
1512            .set_encoding(Encoding::DELTA_BINARY_PACKED)
1513            .set_compression(Compression::GZIP(Default::default()))
1514            .set_dictionary_enabled(false)
1515            .set_statistics_enabled(EnabledStatistics::None)
1516            // specific column settings
1517            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
1518            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
1519            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
1520            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
1521            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
1522            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
1523            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
1524            .build();
1525
1526        fn test_props(props: &WriterProperties) {
1527            assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
1528            assert_eq!(props.data_page_size_limit(), 10);
1529            assert_eq!(props.dictionary_page_size_limit(), 20);
1530            assert_eq!(props.write_batch_size(), 30);
1531            assert_eq!(props.max_row_group_row_count(), Some(40));
1532            assert_eq!(props.created_by(), "default");
1533            assert_eq!(
1534                props.key_value_metadata(),
1535                Some(&vec![
1536                    KeyValue::new("key".to_string(), "value".to_string(),)
1537                ])
1538            );
1539
1540            assert_eq!(
1541                props.encoding(&ColumnPath::from("a")),
1542                Some(Encoding::DELTA_BINARY_PACKED)
1543            );
1544            assert_eq!(
1545                props.compression(&ColumnPath::from("a")),
1546                Compression::GZIP(Default::default())
1547            );
1548            assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
1549            assert_eq!(
1550                props.statistics_enabled(&ColumnPath::from("a")),
1551                EnabledStatistics::None
1552            );
1553
1554            assert_eq!(
1555                props.encoding(&ColumnPath::from("col")),
1556                Some(Encoding::RLE)
1557            );
1558            assert_eq!(
1559                props.compression(&ColumnPath::from("col")),
1560                Compression::SNAPPY
1561            );
1562            assert!(props.dictionary_enabled(&ColumnPath::from("col")));
1563            assert_eq!(
1564                props.statistics_enabled(&ColumnPath::from("col")),
1565                EnabledStatistics::Chunk
1566            );
1567            assert_eq!(
1568                props.bloom_filter_properties(&ColumnPath::from("col")),
1569                Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
1570            );
1571        }
1572
1573        // Test direct build of properties
1574        test_props(&props);
1575
1576        // Test that into_builder() gives the same result
1577        let props_into_builder_and_back = props.into_builder().build();
1578        test_props(&props_into_builder_and_back);
1579    }
1580
1581    #[test]
1582    fn test_writer_properties_builder_partial_defaults() {
1583        let props = WriterProperties::builder()
1584            .set_encoding(Encoding::DELTA_BINARY_PACKED)
1585            .set_compression(Compression::GZIP(Default::default()))
1586            .set_bloom_filter_enabled(true)
1587            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
1588            .build();
1589
1590        assert_eq!(
1591            props.encoding(&ColumnPath::from("col")),
1592            Some(Encoding::RLE)
1593        );
1594        assert_eq!(
1595            props.compression(&ColumnPath::from("col")),
1596            Compression::GZIP(Default::default())
1597        );
1598        assert_eq!(
1599            props.dictionary_enabled(&ColumnPath::from("col")),
1600            DEFAULT_DICTIONARY_ENABLED
1601        );
1602        assert_eq!(
1603            props.bloom_filter_properties(&ColumnPath::from("col")),
1604            Some(&BloomFilterProperties {
1605                fpp: 0.05,
1606                ndv: 1_000_000_u64
1607            })
1608        );
1609    }
1610
1611    #[test]
1612    #[allow(deprecated)]
1613    fn test_writer_properties_deprecated_max_row_group_size_still_works() {
1614        let props = WriterProperties::builder()
1615            .set_max_row_group_size(42)
1616            .build();
1617
1618        assert_eq!(props.max_row_group_row_count(), Some(42));
1619        assert_eq!(props.max_row_group_size(), 42);
1620    }
1621
1622    #[test]
1623    #[should_panic(expected = "Cannot have a 0 max row group row count")]
1624    fn test_writer_properties_panic_on_zero_row_group_row_count() {
1625        let _ = WriterProperties::builder().set_max_row_group_row_count(Some(0));
1626    }
1627
1628    #[test]
1629    #[should_panic(expected = "Cannot have a 0 max row group bytes")]
1630    fn test_writer_properties_panic_on_zero_row_group_bytes() {
1631        let _ = WriterProperties::builder().set_max_row_group_bytes(Some(0));
1632    }
1633
1634    #[test]
1635    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
1636        assert_eq!(
1637            WriterProperties::builder()
1638                .build()
1639                .bloom_filter_properties(&ColumnPath::from("col")),
1640            None
1641        );
1642        assert_eq!(
1643            WriterProperties::builder()
1644                .set_bloom_filter_ndv(100)
1645                .build()
1646                .bloom_filter_properties(&ColumnPath::from("col")),
1647            Some(&BloomFilterProperties {
1648                fpp: 0.05,
1649                ndv: 100
1650            })
1651        );
1652        assert_eq!(
1653            WriterProperties::builder()
1654                .set_bloom_filter_fpp(0.1)
1655                .build()
1656                .bloom_filter_properties(&ColumnPath::from("col")),
1657            Some(&BloomFilterProperties {
1658                fpp: 0.1,
1659                ndv: 1_000_000_u64
1660            })
1661        );
1662    }
1663
1664    #[test]
1665    fn test_writer_properties_column_dictionary_page_size_limit() {
1666        let props = WriterProperties::builder()
1667            .set_dictionary_page_size_limit(100)
1668            .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
1669            .build();
1670
1671        assert_eq!(props.dictionary_page_size_limit(), 100);
1672        assert_eq!(
1673            props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
1674            10
1675        );
1676        assert_eq!(
1677            props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
1678            100
1679        );
1680    }
1681
1682    #[test]
1683    fn test_writer_properties_column_data_page_size_limit() {
1684        let props = WriterProperties::builder()
1685            .set_data_page_size_limit(100)
1686            .set_column_data_page_size_limit(ColumnPath::from("col"), 10)
1687            .build();
1688
1689        assert_eq!(props.data_page_size_limit(), 100);
1690        assert_eq!(
1691            props.column_data_page_size_limit(&ColumnPath::from("col")),
1692            10
1693        );
1694        assert_eq!(
1695            props.column_data_page_size_limit(&ColumnPath::from("other")),
1696            100
1697        );
1698    }
1699
1700    #[test]
1701    fn test_reader_properties_default_settings() {
1702        let props = ReaderProperties::builder().build();
1703
1704        let codec_options = CodecOptionsBuilder::default()
1705            .set_backward_compatible_lz4(true)
1706            .build();
1707
1708        assert_eq!(props.codec_options(), &codec_options);
1709        assert!(!props.read_bloom_filter());
1710    }
1711
1712    #[test]
1713    fn test_reader_properties_builder() {
1714        let props = ReaderProperties::builder()
1715            .set_backward_compatible_lz4(false)
1716            .build();
1717
1718        let codec_options = CodecOptionsBuilder::default()
1719            .set_backward_compatible_lz4(false)
1720            .build();
1721
1722        assert_eq!(props.codec_options(), &codec_options);
1723    }
1724
1725    #[test]
1726    fn test_parse_writerversion() {
1727        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
1728        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1729        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
1730        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);
1731
1732        // test lowercase
1733        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
1734        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1735
1736        // test invalid version
1737        match "PARQUET_-1_0".parse::<WriterVersion>() {
1738            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
1739            Err(e) => {
1740                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
1741            }
1742        }
1743    }
1744
1745    #[test]
1746    fn test_parse_enabledstatistics() {
1747        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
1748        assert_eq!(enabled_statistics, EnabledStatistics::None);
1749        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
1750        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
1751        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
1752        assert_eq!(enabled_statistics, EnabledStatistics::Page);
1753
1754        // test lowercase
1755        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
1756        assert_eq!(enabled_statistics, EnabledStatistics::None);
1757
1758        //test invalid statistics
1759        match "ChunkAndPage".parse::<EnabledStatistics>() {
1760            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
1761            Err(e) => {
1762                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
1763            }
1764        }
1765    }
1766}