parquet/file/
properties.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::KeyValue;
24use crate::format::SortingColumn;
25use crate::schema::types::ColumnPath;
26use std::str::FromStr;
27use std::{collections::HashMap, sync::Arc};
28
/// Default value for [`WriterProperties::data_page_size_limit`] (1 MiB, in bytes)
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
/// (same as [`DEFAULT_PAGE_SIZE`])
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`] (in rows)
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_size`] (in rows)
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
///
/// The crate version is baked in at compile time from `CARGO_PKG_VERSION`.
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
65
/// Parquet writer version.
///
/// This is a basic constant and is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Version 1.0 of the Parquet format
    PARQUET_1_0,
    /// Version 2.0 of the Parquet format
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns the numeric form of this writer version as an `i32`:
    /// `1` for [`Self::PARQUET_1_0`] and `2` for [`Self::PARQUET_2_0`].
    pub fn as_num(&self) -> i32 {
        match *self {
            Self::PARQUET_1_0 => 1,
            Self::PARQUET_2_0 => 2,
        }
    }
}

/// Parses a writer version from its variant name.
///
/// Only the all-uppercase and all-lowercase spellings are recognised; any
/// other input yields an `Err` carrying a human readable message.
impl FromStr for WriterVersion {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if matches!(s, "PARQUET_1_0" | "parquet_1_0") {
            return Ok(Self::PARQUET_1_0);
        }
        if matches!(s, "PARQUET_2_0" | "parquet_2_0") {
            return Ok(Self::PARQUET_2_0);
        }
        Err(format!("Invalid writer version: {s}"))
    }
}
99
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The default is [`DEFAULT_BLOOM_FILTER_POSITION`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
117
/// Reference counted writer properties: an [`Arc`] around [`WriterProperties`].
pub type WriterPropertiesPtr = Arc<WriterProperties>;
120
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    /// Best effort data page size limit, see [`Self::data_page_size_limit`]
    data_page_size_limit: usize,
    /// Best effort maximum page row count, see [`Self::data_page_row_count_limit`]
    data_page_row_count_limit: usize,
    /// Batch size used for writes, see [`Self::write_batch_size`]
    write_batch_size: usize,
    /// Maximum number of rows in a row group, see [`Self::max_row_group_size`]
    max_row_group_size: usize,
    /// Where Bloom filters are written, see [`Self::bloom_filter_position`]
    bloom_filter_position: BloomFilterPosition,
    /// Configured writer version, see [`Self::writer_version`]
    writer_version: WriterVersion,
    /// The `created_by` string, see [`Self::created_by`]
    created_by: String,
    /// Raw "offset index disabled" flag. [`Self::offset_index_disabled`] may
    /// still report `false` when page level statistics are configured.
    offset_index_disabled: bool,
    /// Optional key/value metadata pairs, see [`Self::key_value_metadata`]
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings consulted when a column has no entry (or no value for
    /// the requested setting) in `column_properties`
    default_column_properties: ColumnProperties,
    /// Per-column settings, keyed by column path
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Optional sorting columns, see [`Self::sorting_columns`]
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Column index truncation length, see [`Self::column_index_truncate_length`]
    column_index_truncate_length: Option<usize>,
    /// Statistics truncation length, see [`Self::statistics_truncate_length`]
    statistics_truncate_length: Option<usize>,
    /// Whether type coercion is enabled, see [`Self::coerce_types`]
    coerce_types: bool,
    /// Optional file encryption settings (only with the `encryption` feature)
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<FileEncryptionProperties>,
}
175
176impl Default for WriterProperties {
177    fn default() -> Self {
178        Self::builder().build()
179    }
180}
181
182impl WriterProperties {
183    /// Create a new [`WriterProperties`] with the default settings
184    ///
185    /// See [`WriterProperties::builder`] for customising settings
186    pub fn new() -> Self {
187        Self::default()
188    }
189
190    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
191    /// properties.
192    pub fn builder() -> WriterPropertiesBuilder {
193        WriterPropertiesBuilder::with_defaults()
194    }
195
196    /// Returns data page size limit.
197    ///
198    /// Note: this is a best effort limit based on the write batch size
199    ///
200    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
201    pub fn data_page_size_limit(&self) -> usize {
202        self.data_page_size_limit
203    }
204
205    /// Returns dictionary page size limit.
206    ///
207    /// Note: this is a best effort limit based on the write batch size
208    ///
209    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
210    pub fn dictionary_page_size_limit(&self) -> usize {
211        self.default_column_properties
212            .dictionary_page_size_limit()
213            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
214    }
215
216    /// Returns dictionary page size limit for a specific column.
217    pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
218        self.column_properties
219            .get(col)
220            .and_then(|c| c.dictionary_page_size_limit())
221            .or_else(|| self.default_column_properties.dictionary_page_size_limit())
222            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
223    }
224
225    /// Returns the maximum page row count
226    ///
227    /// Note: this is a best effort limit based on the write batch size
228    ///
229    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
230    pub fn data_page_row_count_limit(&self) -> usize {
231        self.data_page_row_count_limit
232    }
233
234    /// Returns configured batch size for writes.
235    ///
236    /// When writing a batch of data, this setting allows to split it internally into
237    /// smaller batches so we can better estimate the size of a page currently being
238    /// written.
239    ///
240    /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
241    pub fn write_batch_size(&self) -> usize {
242        self.write_batch_size
243    }
244
245    /// Returns maximum number of rows in a row group.
246    ///
247    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
248    pub fn max_row_group_size(&self) -> usize {
249        self.max_row_group_size
250    }
251
252    /// Returns bloom filter position.
253    ///
254    /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
255    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
256        self.bloom_filter_position
257    }
258
259    /// Returns configured writer version.
260    ///
261    /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
262    pub fn writer_version(&self) -> WriterVersion {
263        self.writer_version
264    }
265
266    /// Returns `created_by` string.
267    ///
268    /// For more details see [`WriterPropertiesBuilder::set_created_by`]
269    pub fn created_by(&self) -> &str {
270        &self.created_by
271    }
272
273    /// Returns `true` if offset index writing is disabled.
274    ///
275    /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
276    pub fn offset_index_disabled(&self) -> bool {
277        // If page statistics are to be collected, then do not disable the offset indexes.
278        let default_page_stats_enabled =
279            self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
280        let column_page_stats_enabled = self
281            .column_properties
282            .iter()
283            .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
284        if default_page_stats_enabled || column_page_stats_enabled {
285            return false;
286        }
287
288        self.offset_index_disabled
289    }
290
291    /// Returns `key_value_metadata` KeyValue pairs.
292    ///
293    /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
294    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
295        self.key_value_metadata.as_ref()
296    }
297
298    /// Returns sorting columns.
299    ///
300    /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
301    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
302        self.sorting_columns.as_ref()
303    }
304
305    /// Returns the maximum length of truncated min/max values in the column index.
306    ///
307    /// `None` if truncation is disabled, must be greater than 0 otherwise.
308    ///
309    /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
310    pub fn column_index_truncate_length(&self) -> Option<usize> {
311        self.column_index_truncate_length
312    }
313
314    /// Returns the maximum length of truncated min/max values in [`Statistics`].
315    ///
316    /// `None` if truncation is disabled, must be greater than 0 otherwise.
317    ///
318    /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
319    ///
320    /// [`Statistics`]: crate::file::statistics::Statistics
321    pub fn statistics_truncate_length(&self) -> Option<usize> {
322        self.statistics_truncate_length
323    }
324
325    /// Returns `true` if type coercion is enabled.
326    ///
327    /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
328    pub fn coerce_types(&self) -> bool {
329        self.coerce_types
330    }
331
332    /// Returns encoding for a data page, when dictionary encoding is enabled.
333    ///
334    /// This is not configurable.
335    #[inline]
336    pub fn dictionary_data_page_encoding(&self) -> Encoding {
337        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
338        // Dictionary values are encoded using RLE_DICTIONARY encoding.
339        Encoding::RLE_DICTIONARY
340    }
341
342    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
343    ///
344    /// This is not configurable.
345    #[inline]
346    pub fn dictionary_page_encoding(&self) -> Encoding {
347        // PLAIN_DICTIONARY is deprecated in writer version 1.
348        // Dictionary is encoded using plain encoding.
349        Encoding::PLAIN
350    }
351
352    /// Returns encoding for a column, if set.
353    ///
354    /// In case when dictionary is enabled, returns fallback encoding.
355    ///
356    /// If encoding is not set, then column writer will choose the best encoding
357    /// based on the column type.
358    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
359        self.column_properties
360            .get(col)
361            .and_then(|c| c.encoding())
362            .or_else(|| self.default_column_properties.encoding())
363    }
364
365    /// Returns compression codec for a column.
366    ///
367    /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
368    pub fn compression(&self, col: &ColumnPath) -> Compression {
369        self.column_properties
370            .get(col)
371            .and_then(|c| c.compression())
372            .or_else(|| self.default_column_properties.compression())
373            .unwrap_or(DEFAULT_COMPRESSION)
374    }
375
376    /// Returns `true` if dictionary encoding is enabled for a column.
377    ///
378    /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
379    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
380        self.column_properties
381            .get(col)
382            .and_then(|c| c.dictionary_enabled())
383            .or_else(|| self.default_column_properties.dictionary_enabled())
384            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
385    }
386
387    /// Returns which statistics are written for a column.
388    ///
389    /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
390    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
391        self.column_properties
392            .get(col)
393            .and_then(|c| c.statistics_enabled())
394            .or_else(|| self.default_column_properties.statistics_enabled())
395            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
396    }
397
398    /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
399    ///
400    /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
401    ///
402    /// [`Statistics`]: crate::file::statistics::Statistics
403    pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
404        self.column_properties
405            .get(col)
406            .and_then(|c| c.write_page_header_statistics())
407            .or_else(|| {
408                self.default_column_properties
409                    .write_page_header_statistics()
410            })
411            .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
412    }
413
414    /// Returns the [`BloomFilterProperties`] for the given column
415    ///
416    /// Returns `None` if bloom filter is disabled
417    ///
418    /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
419    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
420        self.column_properties
421            .get(col)
422            .and_then(|c| c.bloom_filter_properties())
423            .or_else(|| self.default_column_properties.bloom_filter_properties())
424    }
425
426    /// Return file encryption properties
427    ///
428    /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
429    #[cfg(feature = "encryption")]
430    pub fn file_encryption_properties(&self) -> Option<&FileEncryptionProperties> {
431        self.file_encryption_properties.as_ref()
432    }
433}
434
435/// Builder for  [`WriterProperties`] Parquet writer configuration.
436///
437/// See example on [`WriterProperties`]
438pub struct WriterPropertiesBuilder {
439    data_page_size_limit: usize,
440    data_page_row_count_limit: usize,
441    write_batch_size: usize,
442    max_row_group_size: usize,
443    bloom_filter_position: BloomFilterPosition,
444    writer_version: WriterVersion,
445    created_by: String,
446    offset_index_disabled: bool,
447    key_value_metadata: Option<Vec<KeyValue>>,
448    default_column_properties: ColumnProperties,
449    column_properties: HashMap<ColumnPath, ColumnProperties>,
450    sorting_columns: Option<Vec<SortingColumn>>,
451    column_index_truncate_length: Option<usize>,
452    statistics_truncate_length: Option<usize>,
453    coerce_types: bool,
454    #[cfg(feature = "encryption")]
455    file_encryption_properties: Option<FileEncryptionProperties>,
456}
457
458impl WriterPropertiesBuilder {
459    /// Returns default state of the builder.
460    fn with_defaults() -> Self {
461        Self {
462            data_page_size_limit: DEFAULT_PAGE_SIZE,
463            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
464            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
465            max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
466            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
467            writer_version: DEFAULT_WRITER_VERSION,
468            created_by: DEFAULT_CREATED_BY.to_string(),
469            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
470            key_value_metadata: None,
471            default_column_properties: Default::default(),
472            column_properties: HashMap::new(),
473            sorting_columns: None,
474            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
475            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
476            coerce_types: DEFAULT_COERCE_TYPES,
477            #[cfg(feature = "encryption")]
478            file_encryption_properties: None,
479        }
480    }
481
482    /// Finalizes the configuration and returns immutable writer properties struct.
483    pub fn build(self) -> WriterProperties {
484        WriterProperties {
485            data_page_size_limit: self.data_page_size_limit,
486            data_page_row_count_limit: self.data_page_row_count_limit,
487            write_batch_size: self.write_batch_size,
488            max_row_group_size: self.max_row_group_size,
489            bloom_filter_position: self.bloom_filter_position,
490            writer_version: self.writer_version,
491            created_by: self.created_by,
492            offset_index_disabled: self.offset_index_disabled,
493            key_value_metadata: self.key_value_metadata,
494            default_column_properties: self.default_column_properties,
495            column_properties: self.column_properties,
496            sorting_columns: self.sorting_columns,
497            column_index_truncate_length: self.column_index_truncate_length,
498            statistics_truncate_length: self.statistics_truncate_length,
499            coerce_types: self.coerce_types,
500            #[cfg(feature = "encryption")]
501            file_encryption_properties: self.file_encryption_properties,
502        }
503    }
504
505    // ----------------------------------------------------------------------
506    // Writer properties related to a file
507
508    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
509    /// via [`DEFAULT_WRITER_VERSION`])
510    ///
511    /// This value can determine what features some readers will support.
512    ///
513    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
514    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
515        self.writer_version = value;
516        self
517    }
518
519    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
520    /// via [`DEFAULT_PAGE_SIZE`]).
521    ///
522    /// The parquet writer will attempt to limit the sizes of each
523    /// `DataPage` to this many bytes. Reducing this value will result
524    /// in larger parquet files, but may improve the effectiveness of
525    /// page index based predicate pushdown during reading.
526    ///
527    /// Note: this is a best effort limit based on value of
528    /// [`set_write_batch_size`](Self::set_write_batch_size).
529    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
530        self.data_page_size_limit = value;
531        self
532    }
533
534    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
535    /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
536    ///
537    /// The parquet writer will attempt to limit the number of rows in
538    /// each `DataPage` to this value. Reducing this value will result
539    /// in larger parquet files, but may improve the effectiveness of
540    /// page index based predicate pushdown during reading.
541    ///
542    /// Note: this is a best effort limit based on value of
543    /// [`set_write_batch_size`](Self::set_write_batch_size).
544    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
545        self.data_page_row_count_limit = value;
546        self
547    }
548
549    /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
550    ///
551    /// For performance reasons, data for each column is written in
552    /// batches of this size.
553    ///
554    /// Additional limits such as such as
555    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
556    /// are checked between batches, and thus the write batch size value acts as an
557    /// upper-bound on the enforcement granularity of other limits.
558    pub fn set_write_batch_size(mut self, value: usize) -> Self {
559        self.write_batch_size = value;
560        self
561    }
562
563    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
564    /// via [`DEFAULT_MAX_ROW_GROUP_SIZE`]).
565    ///
566    /// # Panics
567    /// If the value is set to 0.
568    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
569        assert!(value > 0, "Cannot have a 0 max row group size");
570        self.max_row_group_size = value;
571        self
572    }
573
574    /// Sets where in the final file Bloom Filters are written (defaults to  [`AfterRowGroup`]
575    /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
576    ///
577    /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
578    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
579        self.bloom_filter_position = value;
580        self
581    }
582
583    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
584    /// [`DEFAULT_CREATED_BY`]).
585    ///
586    /// This is a string that will be written into the file metadata
587    pub fn set_created_by(mut self, value: String) -> Self {
588        self.created_by = value;
589        self
590    }
591
592    /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
593    /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
594    ///
595    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
596    ///
597    /// Note: As the offset indexes are useful for accessing data by row number,
598    /// they are always written by default, regardless of whether other statistics
599    /// are enabled. Disabling this metadata may result in a degradation in read
600    /// performance, so use this option with care.
601    ///
602    /// [`Page`]: EnabledStatistics::Page
603    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
604        self.offset_index_disabled = value;
605        self
606    }
607
608    /// Sets "key_value_metadata" property (defaults to `None`).
609    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
610        self.key_value_metadata = value;
611        self
612    }
613
614    /// Sets sorting order of rows in the row group if any (defaults to `None`).
615    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
616        self.sorting_columns = value;
617        self
618    }
619
620    /// Sets the max length of min/max value fields when writing the column
621    /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
622    ///
623    /// This can be used to prevent columns with very long values (hundreds of
624    /// bytes long) from causing the parquet metadata to become huge.
625    ///
626    /// # Notes
627    ///
628    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
629    /// set to [`EnabledStatistics::Page`].
630    ///
631    /// * If `Some`, must be greater than 0, otherwise will panic
632    /// * If `None`, there's no effective limit.
633    ///
634    /// [`Index`]: crate::file::page_index::index::Index
635    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
636        if let Some(value) = max_length {
637            assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`.");
638        }
639
640        self.column_index_truncate_length = max_length;
641        self
642    }
643
644    /// Sets the max length of min/max value fields in row group and data page header
645    /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
646    ///
647    /// # Notes
648    /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
649    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
650    /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
651    /// [`EnabledStatistics::Page`].
652    ///
653    /// * If `Some`, must be greater than 0, otherwise will panic
654    /// * If `None`, there's no effective limit.
655    ///
656    /// # See also
657    /// Truncation of Page Index statistics is controlled separately via
658    /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
659    ///
660    /// [`Statistics`]: crate::file::statistics::Statistics
661    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
662        if let Some(value) = max_length {
663            assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
664        }
665
666        self.statistics_truncate_length = max_length;
667        self
668    }
669
670    /// Should the writer coerce types to parquet native types (defaults to `false` via
671    /// [`DEFAULT_COERCE_TYPES`]).
672    ///
673    /// Leaving this option the default `false` will ensure the exact same data
674    /// written to parquet using this library will be read.
675    ///
676    /// Setting this option to `true` will result in parquet files that can be
677    /// read by more readers, but potentially lose information in the process.
678    ///
679    /// * Types such as [`DataType::Date64`], which have no direct corresponding
680    ///   Parquet type, may be stored with lower precision.
681    ///
682    /// * The internal field names of `List` and `Map` types will be renamed if
683    ///   necessary to match what is required by the newest Parquet specification.
684    ///
685    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
686    ///
687    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
688    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
689    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
690        self.coerce_types = coerce_types;
691        self
692    }
693
694    /// Sets FileEncryptionProperties (defaults to `None`)
695    #[cfg(feature = "encryption")]
696    pub fn with_file_encryption_properties(
697        mut self,
698        file_encryption_properties: FileEncryptionProperties,
699    ) -> Self {
700        self.file_encryption_properties = Some(file_encryption_properties);
701        self
702    }
703
704    // ----------------------------------------------------------------------
705    // Setters for any column (global)
706
707    /// Sets default encoding for all columns.
708    ///
709    /// If dictionary is not enabled, this is treated as a primary encoding for all
710    /// columns. In case when dictionary is enabled for any column, this value is
711    /// considered to be a fallback encoding for that column.
712    ///
713    /// # Panics
714    ///
715    /// if dictionary encoding is specified, regardless of dictionary
716    /// encoding flag being set.
717    pub fn set_encoding(mut self, value: Encoding) -> Self {
718        self.default_column_properties.set_encoding(value);
719        self
720    }
721
722    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
723    /// [`DEFAULT_COMPRESSION`]).
724    ///
725    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
726    pub fn set_compression(mut self, value: Compression) -> Self {
727        self.default_column_properties.set_compression(value);
728        self
729    }
730
731    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
732    /// via [`DEFAULT_DICTIONARY_ENABLED`]).
733    ///
734    /// Use this method to set dictionary encoding, instead of explicitly specifying
735    /// encoding in `set_encoding` method.
736    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
737        self.default_column_properties.set_dictionary_enabled(value);
738        self
739    }
740
741    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
742    /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
743    ///
744    /// The parquet writer will attempt to limit the size of each
745    /// `DataPage` used to store dictionaries to this many
746    /// bytes. Reducing this value will result in larger parquet
747    /// files, but may improve the effectiveness of page index based
748    /// predicate pushdown during reading.
749    ///
750    /// Note: this is a best effort limit based on value of
751    /// [`set_write_batch_size`](Self::set_write_batch_size).
752    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
753        self.default_column_properties
754            .set_dictionary_page_size_limit(value);
755        self
756    }
757
758    /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
759    /// [`DEFAULT_STATISTICS_ENABLED`]).
760    ///
761    /// [`Page`]: EnabledStatistics::Page
762    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
763        self.default_column_properties.set_statistics_enabled(value);
764        self
765    }
766
767    /// enable/disable writing [`Statistics`] in the page header
768    /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
769    ///
770    /// Only applicable if [`Page`] level statistics are gathered.
771    ///
772    /// Setting this value to `true` can greatly increase the size of the resulting Parquet
773    /// file while yielding very little added benefit. Most modern Parquet implementations
774    /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
775    /// those in the page header.
776    ///
777    /// # Note
778    ///
779    /// Prior to version 56.0.0, the `parquet` crate always wrote these
780    /// statistics (the equivalent of setting this option to `true`). This was
781    /// changed in 56.0.0 to follow the recommendation in the Parquet
782    /// specification. See [issue #7580] for more details.
783    ///
784    /// [`Statistics`]: crate::file::statistics::Statistics
785    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
786    /// [`Page`]: EnabledStatistics::Page
787    /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
788    pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
789        self.default_column_properties
790            .set_write_page_header_statistics(value);
791        self
792    }
793
794    /// Sets if bloom filter should be written for all columns (defaults to `false`).
795    ///
796    /// # Notes
797    ///
798    /// * If the bloom filter is enabled previously then it is a no-op.
799    ///
800    /// * If the bloom filter is not enabled, default values for ndv and fpp
801    ///   value are used used. See [`set_bloom_filter_ndv`] and
802    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
803    ///
804    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
805    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
806    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
807        self.default_column_properties
808            .set_bloom_filter_enabled(value);
809        self
810    }
811
812    /// Sets the default target bloom filter false positive probability (fpp)
813    /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
814    ///
815    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
816    /// been called.
817    ///
818    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
819    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
820        self.default_column_properties.set_bloom_filter_fpp(value);
821        self
822    }
823
824    /// Sets default number of distinct values (ndv) for bloom filter for all
825    /// columns (defaults to `1_000_000` via [`DEFAULT_BLOOM_FILTER_NDV`]).
826    ///
827    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
828    /// been called.
829    ///
830    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
831    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
832        self.default_column_properties.set_bloom_filter_ndv(value);
833        self
834    }
835
836    // ----------------------------------------------------------------------
837    // Setters for a specific column
838
839    /// Helper method to get existing or new mutable reference of column properties.
840    #[inline]
841    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
842        self.column_properties.entry(col).or_default()
843    }
844
845    /// Sets encoding for a specific column.
846    ///
847    /// Takes precedence over [`Self::set_encoding`].
848    ///
849    /// If dictionary is not enabled, this is treated as a primary encoding for this
850    /// column. In case when dictionary is enabled for this column, either through
851    /// global defaults or explicitly, this value is considered to be a fallback
852    /// encoding for this column.
853    ///
854    /// # Panics
855    /// If user tries to set dictionary encoding here, regardless of dictionary
856    /// encoding flag being set.
857    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
858        self.get_mut_props(col).set_encoding(value);
859        self
860    }
861
862    /// Sets compression codec for a specific column.
863    ///
864    /// Takes precedence over [`Self::set_compression`].
865    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
866        self.get_mut_props(col).set_compression(value);
867        self
868    }
869
870    /// Sets flag to enable/disable dictionary encoding for a specific column.
871    ///
872    /// Takes precedence over [`Self::set_dictionary_enabled`].
873    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
874        self.get_mut_props(col).set_dictionary_enabled(value);
875        self
876    }
877
878    /// Sets dictionary page size limit for a specific column.
879    ///
880    /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
881    pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
882        self.get_mut_props(col)
883            .set_dictionary_page_size_limit(value);
884        self
885    }
886
887    /// Sets [`EnabledStatistics`] level for a specific column.
888    ///
889    /// Takes precedence over [`Self::set_statistics_enabled`].
890    pub fn set_column_statistics_enabled(
891        mut self,
892        col: ColumnPath,
893        value: EnabledStatistics,
894    ) -> Self {
895        self.get_mut_props(col).set_statistics_enabled(value);
896        self
897    }
898
899    /// Sets whether to write [`Statistics`] in the page header for a specific column.
900    ///
901    /// Takes precedence over [`Self::set_write_page_header_statistics`].
902    ///
903    /// [`Statistics`]: crate::file::statistics::Statistics
904    pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
905        self.get_mut_props(col)
906            .set_write_page_header_statistics(value);
907        self
908    }
909
910    /// Sets whether a bloom filter should be written for a specific column.
911    ///
912    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
913    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
914        self.get_mut_props(col).set_bloom_filter_enabled(value);
915        self
916    }
917
918    /// Sets the false positive probability for bloom filter for a specific column.
919    ///
920    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
921    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
922        self.get_mut_props(col).set_bloom_filter_fpp(value);
923        self
924    }
925
926    /// Sets the number of distinct values for bloom filter for a specific column.
927    ///
928    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
929    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
930        self.get_mut_props(col).set_bloom_filter_ndv(value);
931        self
932    }
933}
934
/// Selects how much statistics the writer computes and stores in the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires more
/// time to read the parquet footer.
///
/// In exchange, a query engine that supports evaluating predicates against
/// statistics can use them to prune row groups and pages during query execution,
/// which can improve query performance.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics, but no page-level statistics.
    ///
    /// One set of statistics is stored for each relevant column of each row group,
    /// so the more row groups are written, the more statistics are stored.
    Chunk,
    /// Compute both page-level and column chunk-level statistics.
    ///
    /// One set of statistics is stored for each relevant column of each row group.
    /// In addition, this enables the writing of the column index (the offset index
    /// is always written regardless of this setting). See [`ParquetColumnIndex`]
    /// for more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}
965
966impl FromStr for EnabledStatistics {
967    type Err = String;
968
969    fn from_str(s: &str) -> Result<Self, Self::Err> {
970        match s {
971            "NONE" | "none" => Ok(EnabledStatistics::None),
972            "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk),
973            "PAGE" | "page" => Ok(EnabledStatistics::Page),
974            _ => Err(format!("Invalid statistics arg: {s}")),
975        }
976    }
977}
978
979impl Default for EnabledStatistics {
980    fn default() -> Self {
981        DEFAULT_STATISTICS_ENABLED
982    }
983}
984
/// Parameters of the bloom filter computed by the writer.
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability (fpp). This should always be between 0 and 1
    /// exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space
    /// versus fpp: the smaller the fpp, the more memory and disk space is required.
    /// Setting it to a reasonable value, e.g. 0.1, 0.05, or 0.001, is therefore
    /// recommended.
    ///
    /// Setting it to a very small number diminishes the value of the filter itself,
    /// as the bitset size is even larger than just storing the whole value. You are
    /// also expected to set `ndv` if it can be known in advance, to greatly reduce
    /// space usage.
    pub fpp: f64,
    /// Number of distinct values (ndv). Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// A bloom filter is most beneficial for columns with large cardinality, so a
    /// good heuristic is to set ndv to the number of rows. If a smaller number of
    /// distinct values is known in advance, using it can reduce disk size. For a
    /// very small ndv it is probably not worth using a bloom filter anyway.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in
    /// disk or memory size.
    pub ndv: u64,
}
1012
1013impl Default for BloomFilterProperties {
1014    fn default() -> Self {
1015        BloomFilterProperties {
1016            fpp: DEFAULT_BLOOM_FILTER_FPP,
1017            ndv: DEFAULT_BLOOM_FILTER_NDV,
1018        }
1019    }
1020}
1021
1022/// Container for column properties that can be changed as part of writer.
1023///
1024/// If a field is `None`, it means that no specific value has been set for this column,
1025/// so some subsequent or default value must be used.
1026#[derive(Debug, Clone, Default, PartialEq)]
1027struct ColumnProperties {
1028    encoding: Option<Encoding>,
1029    codec: Option<Compression>,
1030    dictionary_page_size_limit: Option<usize>,
1031    dictionary_enabled: Option<bool>,
1032    statistics_enabled: Option<EnabledStatistics>,
1033    write_page_header_statistics: Option<bool>,
1034    /// bloom filter related properties
1035    bloom_filter_properties: Option<BloomFilterProperties>,
1036}
1037
1038impl ColumnProperties {
1039    /// Sets encoding for this column.
1040    ///
1041    /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1042    /// In case when dictionary is enabled for a column, this value is considered to
1043    /// be a fallback encoding.
1044    ///
1045    /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1046    /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1047    /// for a column.
1048    fn set_encoding(&mut self, value: Encoding) {
1049        if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1050            panic!("Dictionary encoding can not be used as fallback encoding");
1051        }
1052        self.encoding = Some(value);
1053    }
1054
1055    /// Sets compression codec for this column.
1056    fn set_compression(&mut self, value: Compression) {
1057        self.codec = Some(value);
1058    }
1059
1060    /// Sets whether dictionary encoding is enabled for this column.
1061    fn set_dictionary_enabled(&mut self, enabled: bool) {
1062        self.dictionary_enabled = Some(enabled);
1063    }
1064
1065    /// Sets dictionary page size limit for this column.
1066    fn set_dictionary_page_size_limit(&mut self, value: usize) {
1067        self.dictionary_page_size_limit = Some(value);
1068    }
1069
1070    /// Sets the statistics level for this column.
1071    fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1072        self.statistics_enabled = Some(enabled);
1073    }
1074
1075    /// Sets whether to write statistics in the page header for this column.
1076    fn set_write_page_header_statistics(&mut self, enabled: bool) {
1077        self.write_page_header_statistics = Some(enabled);
1078    }
1079
1080    /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1081    /// otherwise it is a no-op.
1082    /// If `value` is `false`, resets bloom filter properties to `None`.
1083    fn set_bloom_filter_enabled(&mut self, value: bool) {
1084        if value && self.bloom_filter_properties.is_none() {
1085            self.bloom_filter_properties = Some(Default::default())
1086        } else if !value {
1087            self.bloom_filter_properties = None
1088        }
1089    }
1090
1091    /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1092    /// bloom filter if not previously enabled.
1093    ///
1094    /// # Panics
1095    ///
1096    /// Panics if the `value` is not between 0 and 1 exclusive
1097    fn set_bloom_filter_fpp(&mut self, value: f64) {
1098        assert!(
1099            value > 0. && value < 1.0,
1100            "fpp must be between 0 and 1 exclusive, got {value}"
1101        );
1102
1103        self.bloom_filter_properties
1104            .get_or_insert_with(Default::default)
1105            .fpp = value;
1106    }
1107
1108    /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
1109    /// enables bloom filter if not previously enabled.
1110    fn set_bloom_filter_ndv(&mut self, value: u64) {
1111        self.bloom_filter_properties
1112            .get_or_insert_with(Default::default)
1113            .ndv = value;
1114    }
1115
1116    /// Returns optional encoding for this column.
1117    fn encoding(&self) -> Option<Encoding> {
1118        self.encoding
1119    }
1120
1121    /// Returns optional compression codec for this column.
1122    fn compression(&self) -> Option<Compression> {
1123        self.codec
1124    }
1125
1126    /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1127    /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1128    /// been provided.
1129    fn dictionary_enabled(&self) -> Option<bool> {
1130        self.dictionary_enabled
1131    }
1132
1133    /// Returns optional dictionary page size limit for this column.
1134    fn dictionary_page_size_limit(&self) -> Option<usize> {
1135        self.dictionary_page_size_limit
1136    }
1137
1138    /// Returns optional statistics level requested for this column. If result is `None`,
1139    /// then no setting has been provided.
1140    fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1141        self.statistics_enabled
1142    }
1143
1144    /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
1145    /// column.
1146    ///
1147    /// [`Statistics`]: crate::file::statistics::Statistics
1148    fn write_page_header_statistics(&self) -> Option<bool> {
1149        self.write_page_header_statistics
1150    }
1151
1152    /// Returns the bloom filter properties, or `None` if not enabled
1153    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1154        self.bloom_filter_properties.as_ref()
1155    }
1156}
1157
1158/// Reference counted reader properties.
1159pub type ReaderPropertiesPtr = Arc<ReaderProperties>;
1160
/// Value used for [`ReaderProperties::read_bloom_filter`] when the builder was not
/// given an explicit setting: bloom filters are not read.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
1162
1163/// Configuration settings for reading parquet files.
1164///
1165/// All properties are immutable and `Send` + `Sync`.
1166/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
1167///
1168/// # Example
1169///
1170/// ```rust
1171/// use parquet::file::properties::ReaderProperties;
1172///
1173/// // Create properties with default configuration.
1174/// let props = ReaderProperties::builder().build();
1175///
1176/// // Use properties builder to set certain options and assemble the configuration.
1177/// let props = ReaderProperties::builder()
1178///     .set_backward_compatible_lz4(false)
1179///     .build();
1180/// ```
1181pub struct ReaderProperties {
1182    codec_options: CodecOptions,
1183    read_bloom_filter: bool,
1184}
1185
1186impl ReaderProperties {
1187    /// Returns builder for reader properties with default values.
1188    pub fn builder() -> ReaderPropertiesBuilder {
1189        ReaderPropertiesBuilder::with_defaults()
1190    }
1191
1192    /// Returns codec options.
1193    pub(crate) fn codec_options(&self) -> &CodecOptions {
1194        &self.codec_options
1195    }
1196
1197    /// Returns whether to read bloom filter
1198    pub(crate) fn read_bloom_filter(&self) -> bool {
1199        self.read_bloom_filter
1200    }
1201}
1202
1203/// Builder for parquet file reader configuration. See example on
1204/// [`ReaderProperties`]
1205pub struct ReaderPropertiesBuilder {
1206    codec_options_builder: CodecOptionsBuilder,
1207    read_bloom_filter: Option<bool>,
1208}
1209
1210/// Reader properties builder.
1211impl ReaderPropertiesBuilder {
1212    /// Returns default state of the builder.
1213    fn with_defaults() -> Self {
1214        Self {
1215            codec_options_builder: CodecOptionsBuilder::default(),
1216            read_bloom_filter: None,
1217        }
1218    }
1219
1220    /// Finalizes the configuration and returns immutable reader properties struct.
1221    pub fn build(self) -> ReaderProperties {
1222        ReaderProperties {
1223            codec_options: self.codec_options_builder.build(),
1224            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1225        }
1226    }
1227
1228    /// Enable/disable backward compatible LZ4.
1229    ///
1230    /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1231    /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1232    /// with files generated by older versions of this library, and LZ4_RAW, for backward
1233    /// compatibility with files generated by older versions of parquet-cpp.
1234    ///
1235    /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1236    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1237        self.codec_options_builder = self
1238            .codec_options_builder
1239            .set_backward_compatible_lz4(value);
1240        self
1241    }
1242
1243    /// Enable/disable reading bloom filter
1244    ///
1245    /// If reading bloom filter is enabled, bloom filter will be read from the file.
1246    /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1247    ///
1248    /// By default bloom filter is set to be read.
1249    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1250        self.read_bloom_filter = Some(value);
1251        self
1252    }
1253}
1254
1255#[cfg(test)]
1256mod tests {
1257    use super::*;
1258
1259    #[test]
1260    fn test_writer_version() {
1261        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
1262        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
1263    }
1264
1265    #[test]
1266    fn test_writer_properties_default_settings() {
1267        let props = WriterProperties::default();
1268        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
1269        assert_eq!(
1270            props.dictionary_page_size_limit(),
1271            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
1272        );
1273        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
1274        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
1275        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
1276        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
1277        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
1278        assert_eq!(props.key_value_metadata(), None);
1279        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
1280        assert_eq!(
1281            props.compression(&ColumnPath::from("col")),
1282            DEFAULT_COMPRESSION
1283        );
1284        assert_eq!(
1285            props.dictionary_enabled(&ColumnPath::from("col")),
1286            DEFAULT_DICTIONARY_ENABLED
1287        );
1288        assert_eq!(
1289            props.statistics_enabled(&ColumnPath::from("col")),
1290            DEFAULT_STATISTICS_ENABLED
1291        );
1292        assert!(props
1293            .bloom_filter_properties(&ColumnPath::from("col"))
1294            .is_none());
1295    }
1296
1297    #[test]
1298    fn test_writer_properties_dictionary_encoding() {
1299        // dictionary encoding is not configurable, and it should be the same for both
1300        // writer version 1 and 2.
1301        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
1302            let props = WriterProperties::builder()
1303                .set_writer_version(*version)
1304                .build();
1305            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
1306            assert_eq!(
1307                props.dictionary_data_page_encoding(),
1308                Encoding::RLE_DICTIONARY
1309            );
1310        }
1311    }
1312
1313    #[test]
1314    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1315    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
1316        // Should panic when user specifies dictionary encoding as fallback encoding.
1317        WriterProperties::builder()
1318            .set_encoding(Encoding::PLAIN_DICTIONARY)
1319            .build();
1320    }
1321
1322    #[test]
1323    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1324    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
1325        // Should panic when user specifies dictionary encoding as fallback encoding.
1326        WriterProperties::builder()
1327            .set_encoding(Encoding::RLE_DICTIONARY)
1328            .build();
1329    }
1330
1331    #[test]
1332    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1333    fn test_writer_properties_panic_when_dictionary_is_enabled() {
1334        WriterProperties::builder()
1335            .set_dictionary_enabled(true)
1336            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
1337            .build();
1338    }
1339
1340    #[test]
1341    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1342    fn test_writer_properties_panic_when_dictionary_is_disabled() {
1343        WriterProperties::builder()
1344            .set_dictionary_enabled(false)
1345            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
1346            .build();
1347    }
1348
1349    #[test]
1350    fn test_writer_properties_builder() {
1351        let props = WriterProperties::builder()
1352            // file settings
1353            .set_writer_version(WriterVersion::PARQUET_2_0)
1354            .set_data_page_size_limit(10)
1355            .set_dictionary_page_size_limit(20)
1356            .set_write_batch_size(30)
1357            .set_max_row_group_size(40)
1358            .set_created_by("default".to_owned())
1359            .set_key_value_metadata(Some(vec![KeyValue::new(
1360                "key".to_string(),
1361                "value".to_string(),
1362            )]))
1363            // global column settings
1364            .set_encoding(Encoding::DELTA_BINARY_PACKED)
1365            .set_compression(Compression::GZIP(Default::default()))
1366            .set_dictionary_enabled(false)
1367            .set_statistics_enabled(EnabledStatistics::None)
1368            // specific column settings
1369            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
1370            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
1371            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
1372            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
1373            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
1374            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
1375            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
1376            .build();
1377
1378        assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
1379        assert_eq!(props.data_page_size_limit(), 10);
1380        assert_eq!(props.dictionary_page_size_limit(), 20);
1381        assert_eq!(props.write_batch_size(), 30);
1382        assert_eq!(props.max_row_group_size(), 40);
1383        assert_eq!(props.created_by(), "default");
1384        assert_eq!(
1385            props.key_value_metadata(),
1386            Some(&vec![
1387                KeyValue::new("key".to_string(), "value".to_string(),)
1388            ])
1389        );
1390
1391        assert_eq!(
1392            props.encoding(&ColumnPath::from("a")),
1393            Some(Encoding::DELTA_BINARY_PACKED)
1394        );
1395        assert_eq!(
1396            props.compression(&ColumnPath::from("a")),
1397            Compression::GZIP(Default::default())
1398        );
1399        assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
1400        assert_eq!(
1401            props.statistics_enabled(&ColumnPath::from("a")),
1402            EnabledStatistics::None
1403        );
1404
1405        assert_eq!(
1406            props.encoding(&ColumnPath::from("col")),
1407            Some(Encoding::RLE)
1408        );
1409        assert_eq!(
1410            props.compression(&ColumnPath::from("col")),
1411            Compression::SNAPPY
1412        );
1413        assert!(props.dictionary_enabled(&ColumnPath::from("col")));
1414        assert_eq!(
1415            props.statistics_enabled(&ColumnPath::from("col")),
1416            EnabledStatistics::Chunk
1417        );
1418        assert_eq!(
1419            props.bloom_filter_properties(&ColumnPath::from("col")),
1420            Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
1421        );
1422    }
1423
1424    #[test]
1425    fn test_writer_properties_builder_partial_defaults() {
1426        let props = WriterProperties::builder()
1427            .set_encoding(Encoding::DELTA_BINARY_PACKED)
1428            .set_compression(Compression::GZIP(Default::default()))
1429            .set_bloom_filter_enabled(true)
1430            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
1431            .build();
1432
1433        assert_eq!(
1434            props.encoding(&ColumnPath::from("col")),
1435            Some(Encoding::RLE)
1436        );
1437        assert_eq!(
1438            props.compression(&ColumnPath::from("col")),
1439            Compression::GZIP(Default::default())
1440        );
1441        assert_eq!(
1442            props.dictionary_enabled(&ColumnPath::from("col")),
1443            DEFAULT_DICTIONARY_ENABLED
1444        );
1445        assert_eq!(
1446            props.bloom_filter_properties(&ColumnPath::from("col")),
1447            Some(&BloomFilterProperties {
1448                fpp: 0.05,
1449                ndv: 1_000_000_u64
1450            })
1451        );
1452    }
1453
1454    #[test]
1455    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
1456        assert_eq!(
1457            WriterProperties::builder()
1458                .build()
1459                .bloom_filter_properties(&ColumnPath::from("col")),
1460            None
1461        );
1462        assert_eq!(
1463            WriterProperties::builder()
1464                .set_bloom_filter_ndv(100)
1465                .build()
1466                .bloom_filter_properties(&ColumnPath::from("col")),
1467            Some(&BloomFilterProperties {
1468                fpp: 0.05,
1469                ndv: 100
1470            })
1471        );
1472        assert_eq!(
1473            WriterProperties::builder()
1474                .set_bloom_filter_fpp(0.1)
1475                .build()
1476                .bloom_filter_properties(&ColumnPath::from("col")),
1477            Some(&BloomFilterProperties {
1478                fpp: 0.1,
1479                ndv: 1_000_000_u64
1480            })
1481        );
1482    }
1483
1484    #[test]
1485    fn test_writer_properties_column_dictionary_page_size_limit() {
1486        let props = WriterProperties::builder()
1487            .set_dictionary_page_size_limit(100)
1488            .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
1489            .build();
1490
1491        assert_eq!(props.dictionary_page_size_limit(), 100);
1492        assert_eq!(
1493            props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
1494            10
1495        );
1496        assert_eq!(
1497            props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
1498            100
1499        );
1500    }
1501
1502    #[test]
1503    fn test_reader_properties_default_settings() {
1504        let props = ReaderProperties::builder().build();
1505
1506        let codec_options = CodecOptionsBuilder::default()
1507            .set_backward_compatible_lz4(true)
1508            .build();
1509
1510        assert_eq!(props.codec_options(), &codec_options);
1511        assert!(!props.read_bloom_filter());
1512    }
1513
1514    #[test]
1515    fn test_reader_properties_builder() {
1516        let props = ReaderProperties::builder()
1517            .set_backward_compatible_lz4(false)
1518            .build();
1519
1520        let codec_options = CodecOptionsBuilder::default()
1521            .set_backward_compatible_lz4(false)
1522            .build();
1523
1524        assert_eq!(props.codec_options(), &codec_options);
1525    }
1526
1527    #[test]
1528    fn test_parse_writerversion() {
1529        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
1530        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1531        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
1532        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);
1533
1534        // test lowercase
1535        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
1536        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1537
1538        // test invalid version
1539        match "PARQUET_-1_0".parse::<WriterVersion>() {
1540            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
1541            Err(e) => {
1542                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
1543            }
1544        }
1545    }
1546
1547    #[test]
1548    fn test_parse_enabledstatistics() {
1549        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
1550        assert_eq!(enabled_statistics, EnabledStatistics::None);
1551        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
1552        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
1553        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
1554        assert_eq!(enabled_statistics, EnabledStatistics::Page);
1555
1556        // test lowercase
1557        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
1558        assert_eq!(enabled_statistics, EnabledStatistics::None);
1559
1560        //test invalid statistics
1561        match "ChunkAndPage".parse::<EnabledStatistics>() {
1562            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
1563            Err(e) => {
1564                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
1565            }
1566        }
1567    }
1568}
parquet/file/properties.rs

parquet/file/
properties.rs