// parquet/file/properties.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::{KeyValue, SortingColumn};
24use crate::schema::types::ColumnPath;
25use std::str::FromStr;
26use std::{collections::HashMap, sync::Arc};
27
/// Default value for [`WriterProperties::data_page_size_limit`]
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_row_count`]
pub const DEFAULT_MAX_ROW_GROUP_ROW_COUNT: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`].
///
/// Note: this is only the fallback default used when constructing [`BloomFilterProperties`]
/// directly. When using [`WriterPropertiesBuilder`], columns with bloom filters enabled
/// but without an explicit NDV will have their NDV resolved at build time to
/// [`WriterProperties::max_row_group_row_count`], which may differ from this constant
/// if the user configured a custom row group size.
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = DEFAULT_MAX_ROW_GROUP_ROW_COUNT as u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
/// Default minimum chunk size for content-defined chunking: 256 KiB.
pub const DEFAULT_CDC_MIN_CHUNK_SIZE: usize = 256 * 1024;
/// Default maximum chunk size for content-defined chunking: 1024 KiB.
pub const DEFAULT_CDC_MAX_CHUNK_SIZE: usize = 1024 * 1024;
/// Default normalization level for content-defined chunking.
pub const DEFAULT_CDC_NORM_LEVEL: i32 = 0;
76
/// EXPERIMENTAL: Options for content-defined chunking (CDC).
///
/// Content-defined chunking is an experimental feature that optimizes parquet
/// files for content addressable storage (CAS) systems by writing data pages
/// according to content-defined chunk boundaries. This allows for more
/// efficient deduplication of data across files, hence more efficient network
/// transfers and storage.
///
/// Each content-defined chunk is written as a separate parquet data page. The
/// following options control the chunks' size and the chunking process. Note
/// that the chunk size is calculated based on the logical value of the data,
/// before any encoding or compression is applied.
///
/// Use [`CdcOptions::default`] for the recommended configuration
/// ([`DEFAULT_CDC_MIN_CHUNK_SIZE`], [`DEFAULT_CDC_MAX_CHUNK_SIZE`],
/// [`DEFAULT_CDC_NORM_LEVEL`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CdcOptions {
    /// Minimum chunk size in bytes, default is 256 KiB.
    /// The rolling hash will not be updated until this size is reached for each chunk.
    /// Note that all data sent through the hash function is counted towards the chunk
    /// size, including definition and repetition levels if present.
    pub min_chunk_size: usize,
    /// Maximum chunk size in bytes, default is 1024 KiB.
    /// The chunker will create a new chunk whenever the chunk size exceeds this value.
    /// Note that the parquet writer has a related [`data_page_size_limit`] property that
    /// controls the maximum size of a parquet data page after encoding. While setting
    /// `data_page_size_limit` to a smaller value than `max_chunk_size` doesn't affect
    /// the chunking effectiveness, it results in more small parquet data pages.
    ///
    /// [`data_page_size_limit`]: WriterPropertiesBuilder::set_data_page_size_limit
    pub max_chunk_size: usize,
    /// Number of bit adjustment to the gearhash mask in order to center the chunk size
    /// around the average size more aggressively, default is 0.
    /// Increasing the normalization level increases the probability of finding a chunk,
    /// improving the deduplication ratio, but also increasing the number of small chunks
    /// resulting in many small parquet data pages. The default value provides a good
    /// balance between deduplication ratio and fragmentation.
    /// Use norm_level=1 or norm_level=2 to reach a higher deduplication ratio at the
    /// expense of fragmentation. Negative values can also be used to reduce the
    /// probability of finding a chunk, resulting in larger chunks and fewer data pages.
    /// Note that values outside [-3, 3] are not recommended, prefer using the default
    /// value of 0 for most use cases.
    pub norm_level: i32,
}
118
119impl Default for CdcOptions {
120    fn default() -> Self {
121        Self {
122            min_chunk_size: DEFAULT_CDC_MIN_CHUNK_SIZE,
123            max_chunk_size: DEFAULT_CDC_MAX_CHUNK_SIZE,
124            norm_level: DEFAULT_CDC_NORM_LEVEL,
125        }
126    }
127}
128
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns the writer version as an `i32`: `1` for 1.0, `2` for 2.0.
    pub fn as_num(&self) -> i32 {
        match *self {
            Self::PARQUET_1_0 => 1,
            Self::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a version from its exact upper- or lower-case name;
    /// any other spelling is rejected.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s == "PARQUET_1_0" || s == "parquet_1_0" {
            Ok(Self::PARQUET_1_0)
        } else if s == "PARQUET_2_0" || s == "parquet_2_0" {
            Ok(Self::PARQUET_2_0)
        } else {
            Err(format!("Invalid writer version: {s}"))
        }
    }
}
162
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The default is [`Self::AfterRowGroup`] via [`DEFAULT_BLOOM_FILTER_POSITION`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
180
/// Reference counted writer properties (an [`Arc`] of [`WriterProperties`]).
pub type WriterPropertiesPtr = Arc<WriterProperties>;
183
/// Resolved state of [`WriterPropertiesBuilder::set_offset_index_disabled`].
///
/// When a user disables offset indexes but page-level statistics are enabled,
/// the setting is overridden (offset indexes remain enabled). This enum
/// preserves the user's original intent so that a round-trip through
/// `WriterPropertiesBuilder` does not lose it.
///
/// Resolved from the builder's raw `offset_index_disabled` flag in
/// [`WriterPropertiesBuilder::build`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OffsetIndexSetting {
    /// Offset indexes are enabled (the default).
    Enabled,
    /// User disabled offset indexes and no page-level statistics override it.
    Disabled,
    /// User disabled offset indexes, but page-level statistics require them,
    /// so they remain enabled.
    DisabledOverridden,
}
200
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    // Best-effort cap on the number of rows written to a single data page.
    data_page_row_count_limit: usize,
    // Granularity (in rows) at which data is handed to the column writers;
    // other limits are only checked between batches.
    write_batch_size: usize,
    // Maximum rows per row group; `None` means unlimited.
    max_row_group_row_count: Option<usize>,
    // Maximum estimated encoded bytes per row group; `None` means unlimited.
    max_row_group_bytes: Option<usize>,
    // Where Bloom filters are placed in the finished file.
    bloom_filter_position: BloomFilterPosition,
    // Format version recorded in the file metadata.
    writer_version: WriterVersion,
    // "created by" string written into the file metadata.
    created_by: String,
    // Resolved offset-index intent (see `OffsetIndexSetting`); computed in
    // `WriterPropertiesBuilder::build`.
    offset_index_setting: OffsetIndexSetting,
    // Optional application-defined key/value metadata for the file footer.
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    // File-wide fallback settings used when a column has no explicit override.
    default_column_properties: ColumnProperties,
    // Per-column overrides, keyed by dotted column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    // Declared sort order of the data, written to row group metadata.
    sorting_columns: Option<Vec<SortingColumn>>,
    // Max length of min/max values in the column index; `None` = no truncation.
    column_index_truncate_length: Option<usize>,
    // Max length of min/max values in `Statistics`; `None` = no truncation.
    statistics_truncate_length: Option<usize>,
    // Whether to coerce types on write.
    coerce_types: bool,
    // EXPERIMENTAL: content-defined chunking config; `None` disables CDC.
    content_defined_chunking: Option<CdcOptions>,
    // Encryption configuration, when the `encryption` feature is enabled.
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
256
257impl Default for WriterProperties {
258    fn default() -> Self {
259        Self::builder().build()
260    }
261}
262
263impl WriterProperties {
264    /// Create a new [`WriterProperties`] with the default settings
265    ///
266    /// See [`WriterProperties::builder`] for customising settings
267    pub fn new() -> Self {
268        Self::default()
269    }
270
271    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
272    /// properties.
273    pub fn builder() -> WriterPropertiesBuilder {
274        WriterPropertiesBuilder::default()
275    }
276
277    /// Converts this [`WriterProperties`] into a [`WriterPropertiesBuilder`]
278    /// Used for mutating existing property settings
279    pub fn into_builder(self) -> WriterPropertiesBuilder {
280        self.into()
281    }
282
283    /// Returns data page size limit.
284    ///
285    /// Note: this is a best effort limit based on the write batch size
286    ///
287    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
288    pub fn data_page_size_limit(&self) -> usize {
289        self.default_column_properties
290            .data_page_size_limit()
291            .unwrap_or(DEFAULT_PAGE_SIZE)
292    }
293
294    /// Returns data page size limit for a specific column.
295    ///
296    /// Takes precedence over [`Self::data_page_size_limit`].
297    ///
298    /// Note: this is a best effort limit based on the write batch size.
299    pub fn column_data_page_size_limit(&self, col: &ColumnPath) -> usize {
300        self.column_properties
301            .get(col)
302            .and_then(|c| c.data_page_size_limit())
303            .or_else(|| self.default_column_properties.data_page_size_limit())
304            .unwrap_or(DEFAULT_PAGE_SIZE)
305    }
306
307    /// Returns dictionary page size limit.
308    ///
309    /// Note: this is a best effort limit based on the write batch size
310    ///
311    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
312    pub fn dictionary_page_size_limit(&self) -> usize {
313        self.default_column_properties
314            .dictionary_page_size_limit()
315            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
316    }
317
318    /// Returns dictionary page size limit for a specific column.
319    pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
320        self.column_properties
321            .get(col)
322            .and_then(|c| c.dictionary_page_size_limit())
323            .or_else(|| self.default_column_properties.dictionary_page_size_limit())
324            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
325    }
326
327    /// Returns the maximum page row count
328    ///
329    /// Note: this is a best effort limit based on the write batch size
330    ///
331    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
332    pub fn data_page_row_count_limit(&self) -> usize {
333        self.data_page_row_count_limit
334    }
335
336    /// Returns configured batch size for writes.
337    ///
338    /// When writing a batch of data, this setting allows to split it internally into
339    /// smaller batches so we can better estimate the size of a page currently being
340    /// written.
341    ///
342    /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
343    pub fn write_batch_size(&self) -> usize {
344        self.write_batch_size
345    }
346
347    /// Returns maximum number of rows in a row group, or `usize::MAX` if unlimited.
348    ///
349    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
350    #[deprecated(since = "58.0.0", note = "Use `max_row_group_row_count` instead")]
351    pub fn max_row_group_size(&self) -> usize {
352        self.max_row_group_row_count.unwrap_or(usize::MAX)
353    }
354
355    /// Returns maximum number of rows in a row group, or `None` if unlimited.
356    ///
357    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_row_count`]
358    pub fn max_row_group_row_count(&self) -> Option<usize> {
359        self.max_row_group_row_count
360    }
361
362    /// Returns maximum size of a row group in bytes, or `None` if unlimited.
363    ///
364    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_bytes`]
365    pub fn max_row_group_bytes(&self) -> Option<usize> {
366        self.max_row_group_bytes
367    }
368
369    /// Returns bloom filter position.
370    ///
371    /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
372    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
373        self.bloom_filter_position
374    }
375
376    /// Returns configured writer version.
377    ///
378    /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
379    pub fn writer_version(&self) -> WriterVersion {
380        self.writer_version
381    }
382
383    /// Returns `created_by` string.
384    ///
385    /// For more details see [`WriterPropertiesBuilder::set_created_by`]
386    pub fn created_by(&self) -> &str {
387        &self.created_by
388    }
389
390    /// Returns `true` if offset index writing is disabled.
391    ///
392    /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
393    pub fn offset_index_disabled(&self) -> bool {
394        matches!(self.offset_index_setting, OffsetIndexSetting::Disabled)
395    }
396
397    /// Returns `key_value_metadata` KeyValue pairs.
398    ///
399    /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
400    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
401        self.key_value_metadata.as_ref()
402    }
403
404    /// Returns sorting columns.
405    ///
406    /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
407    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
408        self.sorting_columns.as_ref()
409    }
410
411    /// Returns the maximum length of truncated min/max values in the column index.
412    ///
413    /// `None` if truncation is disabled, must be greater than 0 otherwise.
414    ///
415    /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
416    pub fn column_index_truncate_length(&self) -> Option<usize> {
417        self.column_index_truncate_length
418    }
419
420    /// Returns the maximum length of truncated min/max values in [`Statistics`].
421    ///
422    /// `None` if truncation is disabled, must be greater than 0 otherwise.
423    ///
424    /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
425    ///
426    /// [`Statistics`]: crate::file::statistics::Statistics
427    pub fn statistics_truncate_length(&self) -> Option<usize> {
428        self.statistics_truncate_length
429    }
430
431    /// Returns `true` if type coercion is enabled.
432    ///
433    /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
434    pub fn coerce_types(&self) -> bool {
435        self.coerce_types
436    }
437
438    /// EXPERIMENTAL: Returns content-defined chunking options, or `None` if CDC is disabled.
439    ///
440    /// For more details see [`WriterPropertiesBuilder::set_content_defined_chunking`]
441    pub fn content_defined_chunking(&self) -> Option<&CdcOptions> {
442        self.content_defined_chunking.as_ref()
443    }
444
445    /// Returns encoding for a data page, when dictionary encoding is enabled.
446    ///
447    /// This is not configurable.
448    #[inline]
449    pub fn dictionary_data_page_encoding(&self) -> Encoding {
450        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
451        // Dictionary values are encoded using RLE_DICTIONARY encoding.
452        Encoding::RLE_DICTIONARY
453    }
454
455    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
456    ///
457    /// This is not configurable.
458    #[inline]
459    pub fn dictionary_page_encoding(&self) -> Encoding {
460        // PLAIN_DICTIONARY is deprecated in writer version 1.
461        // Dictionary is encoded using plain encoding.
462        Encoding::PLAIN
463    }
464
465    /// Returns encoding for a column, if set.
466    ///
467    /// In case when dictionary is enabled, returns fallback encoding.
468    ///
469    /// If encoding is not set, then column writer will choose the best encoding
470    /// based on the column type.
471    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
472        self.column_properties
473            .get(col)
474            .and_then(|c| c.encoding())
475            .or_else(|| self.default_column_properties.encoding())
476    }
477
478    /// Returns compression codec for a column.
479    ///
480    /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
481    pub fn compression(&self, col: &ColumnPath) -> Compression {
482        self.column_properties
483            .get(col)
484            .and_then(|c| c.compression())
485            .or_else(|| self.default_column_properties.compression())
486            .unwrap_or(DEFAULT_COMPRESSION)
487    }
488
489    /// Returns `true` if dictionary encoding is enabled for a column.
490    ///
491    /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
492    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
493        self.column_properties
494            .get(col)
495            .and_then(|c| c.dictionary_enabled())
496            .or_else(|| self.default_column_properties.dictionary_enabled())
497            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
498    }
499
500    /// Returns which statistics are written for a column.
501    ///
502    /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
503    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
504        self.column_properties
505            .get(col)
506            .and_then(|c| c.statistics_enabled())
507            .or_else(|| self.default_column_properties.statistics_enabled())
508            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
509    }
510
511    /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
512    ///
513    /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
514    ///
515    /// [`Statistics`]: crate::file::statistics::Statistics
516    pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
517        self.column_properties
518            .get(col)
519            .and_then(|c| c.write_page_header_statistics())
520            .or_else(|| {
521                self.default_column_properties
522                    .write_page_header_statistics()
523            })
524            .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
525    }
526
527    /// Returns the [`BloomFilterProperties`] for the given column
528    ///
529    /// Returns `None` if bloom filter is disabled
530    ///
531    /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
532    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
533        self.column_properties
534            .get(col)
535            .and_then(|c| c.bloom_filter_properties())
536            .or_else(|| self.default_column_properties.bloom_filter_properties())
537    }
538
539    /// Return file encryption properties
540    ///
541    /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
542    #[cfg(feature = "encryption")]
543    pub fn file_encryption_properties(&self) -> Option<&Arc<FileEncryptionProperties>> {
544        self.file_encryption_properties.as_ref()
545    }
546}
547
/// Builder for  [`WriterProperties`] Parquet writer configuration.
///
/// See example on [`WriterProperties`]
#[derive(Debug, Clone)]
pub struct WriterPropertiesBuilder {
    // Best-effort cap on the number of rows per data page.
    data_page_row_count_limit: usize,
    // Granularity (in rows) at which data is handed to the column writers.
    write_batch_size: usize,
    // Maximum rows per row group; `None` means unlimited.
    max_row_group_row_count: Option<usize>,
    // Maximum estimated encoded bytes per row group; `None` means unlimited.
    max_row_group_bytes: Option<usize>,
    // Where Bloom filters are placed in the finished file.
    bloom_filter_position: BloomFilterPosition,
    // Format version recorded in the file metadata.
    writer_version: WriterVersion,
    // "created by" string written into the file metadata.
    created_by: String,
    // Raw user request; resolved to an `OffsetIndexSetting` in `build()`
    // (page-level statistics override a `true` here).
    offset_index_disabled: bool,
    // Optional application-defined key/value metadata for the file footer.
    key_value_metadata: Option<Vec<KeyValue>>,
    // File-wide fallback settings used when a column has no explicit override.
    default_column_properties: ColumnProperties,
    // Per-column overrides, keyed by dotted column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    // Declared sort order of the data, written to row group metadata.
    sorting_columns: Option<Vec<SortingColumn>>,
    // Max length of min/max values in the column index; `None` = no truncation.
    column_index_truncate_length: Option<usize>,
    // Max length of min/max values in `Statistics`; `None` = no truncation.
    statistics_truncate_length: Option<usize>,
    // Whether to coerce types on write.
    coerce_types: bool,
    // EXPERIMENTAL: content-defined chunking config; `None` disables CDC.
    content_defined_chunking: Option<CdcOptions>,
    // Encryption configuration, when the `encryption` feature is enabled.
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
572
573impl Default for WriterPropertiesBuilder {
574    /// Returns default state of the builder.
575    fn default() -> Self {
576        Self {
577            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
578            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
579            max_row_group_row_count: Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT),
580            max_row_group_bytes: None,
581            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
582            writer_version: DEFAULT_WRITER_VERSION,
583            created_by: DEFAULT_CREATED_BY.to_string(),
584            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
585            key_value_metadata: None,
586            default_column_properties: Default::default(),
587            column_properties: HashMap::new(),
588            sorting_columns: None,
589            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
590            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
591            coerce_types: DEFAULT_COERCE_TYPES,
592            content_defined_chunking: None,
593            #[cfg(feature = "encryption")]
594            file_encryption_properties: None,
595        }
596    }
597}
598
599impl WriterPropertiesBuilder {
600    /// Finalizes the configuration and returns immutable writer properties struct.
601    pub fn build(self) -> WriterProperties {
602        // Pre-compute offset_index_setting
603        let offset_index_setting = if self.offset_index_disabled {
604            let default_page_stats_enabled = self.default_column_properties.statistics_enabled()
605                == Some(EnabledStatistics::Page);
606            let column_page_stats_enabled = self.column_properties.iter().any(|path_props| {
607                path_props.1.statistics_enabled() == Some(EnabledStatistics::Page)
608            });
609            if default_page_stats_enabled || column_page_stats_enabled {
610                OffsetIndexSetting::DisabledOverridden
611            } else {
612                OffsetIndexSetting::Disabled
613            }
614        } else {
615            OffsetIndexSetting::Enabled
616        };
617
618        // Resolve bloom filter NDV for columns where it wasn't explicitly set:
619        // default to max_row_group_row_count so the filter is never undersized.
620        let default_ndv = self
621            .max_row_group_row_count
622            .unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT) as u64;
623        let mut default_column_properties = self.default_column_properties;
624        default_column_properties.resolve_bloom_filter_ndv(default_ndv);
625        let mut column_properties = self.column_properties;
626        for props in column_properties.values_mut() {
627            props.resolve_bloom_filter_ndv(default_ndv);
628        }
629
630        WriterProperties {
631            data_page_row_count_limit: self.data_page_row_count_limit,
632            write_batch_size: self.write_batch_size,
633            max_row_group_row_count: self.max_row_group_row_count,
634            max_row_group_bytes: self.max_row_group_bytes,
635            bloom_filter_position: self.bloom_filter_position,
636            writer_version: self.writer_version,
637            created_by: self.created_by,
638            offset_index_setting,
639            key_value_metadata: self.key_value_metadata,
640            default_column_properties,
641            column_properties,
642            sorting_columns: self.sorting_columns,
643            column_index_truncate_length: self.column_index_truncate_length,
644            statistics_truncate_length: self.statistics_truncate_length,
645            coerce_types: self.coerce_types,
646            content_defined_chunking: self.content_defined_chunking,
647            #[cfg(feature = "encryption")]
648            file_encryption_properties: self.file_encryption_properties,
649        }
650    }
651
652    // ----------------------------------------------------------------------
653    // Writer properties related to a file
654
655    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
656    /// via [`DEFAULT_WRITER_VERSION`])
657    ///
658    /// This value can determine what features some readers will support.
659    ///
660    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
661    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
662        self.writer_version = value;
663        self
664    }
665
666    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
667    /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
668    ///
669    /// The parquet writer will attempt to limit the number of rows in
670    /// each `DataPage` to this value. Reducing this value will result
671    /// in larger parquet files, but may improve the effectiveness of
672    /// page index based predicate pushdown during reading.
673    ///
674    /// Note: this is a best effort limit based on value of
675    /// [`set_write_batch_size`](Self::set_write_batch_size).
676    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
677        self.data_page_row_count_limit = value;
678        self
679    }
680
681    /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
682    ///
683    /// For performance reasons, data for each column is written in
684    /// batches of this size.
685    ///
686    /// Additional limits such as such as
687    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
688    /// are checked between batches, and thus the write batch size value acts as an
689    /// upper-bound on the enforcement granularity of other limits.
690    pub fn set_write_batch_size(mut self, value: usize) -> Self {
691        self.write_batch_size = value;
692        self
693    }
694
695    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
696    /// via [`DEFAULT_MAX_ROW_GROUP_ROW_COUNT`]).
697    ///
698    /// # Panics
699    /// If the value is set to 0.
700    #[deprecated(since = "58.0.0", note = "Use `set_max_row_group_row_count` instead")]
701    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
702        assert!(value > 0, "Cannot have a 0 max row group size");
703        self.max_row_group_row_count = Some(value);
704        self
705    }
706
707    /// Sets maximum number of rows in a row group, or `None` for unlimited.
708    ///
709    /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
710    /// the row group with the smaller limit will be produced.
711    ///
712    /// # Panics
713    /// If the value is `Some(0)`.
714    pub fn set_max_row_group_row_count(mut self, value: Option<usize>) -> Self {
715        assert_ne!(value, Some(0), "Cannot have a 0 max row group row count");
716        self.max_row_group_row_count = value;
717        self
718    }
719
720    /// Sets maximum size of a row group in bytes, or `None` for unlimited.
721    ///
722    /// Row groups are flushed when their estimated encoded size exceeds this threshold.
723    /// This is similar to the official Java implementation for `parquet.block.size`'s behavior.
724    ///
725    /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
726    /// the row group with the smaller limit will be produced.
727    ///
728    /// # Panics
729    /// If the value is `Some(0)`.
730    pub fn set_max_row_group_bytes(mut self, value: Option<usize>) -> Self {
731        assert_ne!(value, Some(0), "Cannot have a 0 max row group bytes");
732        self.max_row_group_bytes = value;
733        self
734    }
735
736    /// Sets where in the final file Bloom Filters are written (defaults to  [`AfterRowGroup`]
737    /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
738    ///
739    /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
740    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
741        self.bloom_filter_position = value;
742        self
743    }
744
745    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
746    /// [`DEFAULT_CREATED_BY`]).
747    ///
748    /// This is a string that will be written into the file metadata
749    pub fn set_created_by(mut self, value: String) -> Self {
750        self.created_by = value;
751        self
752    }
753
754    /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
755    /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
756    ///
757    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
758    ///
759    /// Note: As the offset indexes are useful for accessing data by row number,
760    /// they are always written by default, regardless of whether other statistics
761    /// are enabled. Disabling this metadata may result in a degradation in read
762    /// performance, so use this option with care.
763    ///
764    /// [`Page`]: EnabledStatistics::Page
765    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
766        self.offset_index_disabled = value;
767        self
768    }
769
770    /// Sets "key_value_metadata" property (defaults to `None`).
771    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
772        self.key_value_metadata = value;
773        self
774    }
775
776    /// Sets sorting order of rows in the row group if any (defaults to `None`).
777    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
778        self.sorting_columns = value;
779        self
780    }
781
782    /// Sets the max length of min/max value fields when writing the column
783    /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
784    ///
785    /// This can be used to prevent columns with very long values (hundreds of
786    /// bytes long) from causing the parquet metadata to become huge.
787    ///
788    /// # Notes
789    ///
790    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
791    /// set to [`EnabledStatistics::Page`].
792    ///
793    /// * If `Some`, must be greater than 0, otherwise will panic
794    /// * If `None`, there's no effective limit.
795    ///
796    /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
797    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
798        if let Some(value) = max_length {
799            assert!(
800                value > 0,
801                "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`."
802            );
803        }
804
805        self.column_index_truncate_length = max_length;
806        self
807    }
808
809    /// Sets the max length of min/max value fields in row group and data page header
810    /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
811    ///
812    /// # Notes
813    /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
814    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
815    /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
816    /// [`EnabledStatistics::Page`].
817    ///
818    /// * If `Some`, must be greater than 0, otherwise will panic
819    /// * If `None`, there's no effective limit.
820    ///
821    /// # See also
822    /// Truncation of Page Index statistics is controlled separately via
823    /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
824    ///
825    /// [`Statistics`]: crate::file::statistics::Statistics
826    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
827        if let Some(value) = max_length {
828            assert!(
829                value > 0,
830                "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`."
831            );
832        }
833
834        self.statistics_truncate_length = max_length;
835        self
836    }
837
838    /// Should the writer coerce types to parquet native types (defaults to `false` via
839    /// [`DEFAULT_COERCE_TYPES`]).
840    ///
841    /// Leaving this option the default `false` will ensure the exact same data
842    /// written to parquet using this library will be read.
843    ///
844    /// Setting this option to `true` will result in parquet files that can be
845    /// read by more readers, but potentially lose information in the process.
846    ///
847    /// * Types such as [`DataType::Date64`], which have no direct corresponding
848    ///   Parquet type, may be stored with lower precision.
849    ///
850    /// * The internal field names of `List` and `Map` types will be renamed if
851    ///   necessary to match what is required by the newest Parquet specification.
852    ///
853    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
854    ///
855    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
856    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
857    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
858        self.coerce_types = coerce_types;
859        self
860    }
861
862    /// EXPERIMENTAL: Sets content-defined chunking options, or disables CDC with `None`.
863    ///
864    /// When enabled, data page boundaries are determined by a rolling hash of the
865    /// column values, so unchanged data produces identical byte sequences across
866    /// file versions. This enables efficient deduplication on content-addressable
867    /// storage systems.
868    ///
869    /// Only supported through the Arrow writer interface ([`ArrowWriter`]).
870    ///
871    /// # Panics
872    ///
873    /// Panics if `min_chunk_size == 0` or `max_chunk_size <= min_chunk_size`.
874    ///
875    /// [`ArrowWriter`]: crate::arrow::arrow_writer::ArrowWriter
876    pub fn set_content_defined_chunking(mut self, options: Option<CdcOptions>) -> Self {
877        if let Some(ref options) = options {
878            assert!(
879                options.min_chunk_size > 0,
880                "min_chunk_size must be positive"
881            );
882            assert!(
883                options.max_chunk_size > options.min_chunk_size,
884                "max_chunk_size ({}) must be greater than min_chunk_size ({})",
885                options.max_chunk_size,
886                options.min_chunk_size
887            );
888        }
889        self.content_defined_chunking = options;
890        self
891    }
892
893    /// Sets FileEncryptionProperties (defaults to `None`)
894    #[cfg(feature = "encryption")]
895    pub fn with_file_encryption_properties(
896        mut self,
897        file_encryption_properties: Arc<FileEncryptionProperties>,
898    ) -> Self {
899        self.file_encryption_properties = Some(file_encryption_properties);
900        self
901    }
902
903    // ----------------------------------------------------------------------
904    // Setters for any column (global)
905
906    /// Sets default encoding for all columns.
907    ///
908    /// If dictionary is not enabled, this is treated as a primary encoding for all
909    /// columns. In case when dictionary is enabled for any column, this value is
910    /// considered to be a fallback encoding for that column.
911    ///
912    /// # Panics
913    ///
914    /// if dictionary encoding is specified, regardless of dictionary
915    /// encoding flag being set.
916    pub fn set_encoding(mut self, value: Encoding) -> Self {
917        self.default_column_properties.set_encoding(value);
918        self
919    }
920
921    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
922    /// [`DEFAULT_COMPRESSION`]).
923    ///
924    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
925    pub fn set_compression(mut self, value: Compression) -> Self {
926        self.default_column_properties.set_compression(value);
927        self
928    }
929
930    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
931    /// via [`DEFAULT_DICTIONARY_ENABLED`]).
932    ///
933    /// Use this method to set dictionary encoding, instead of explicitly specifying
934    /// encoding in `set_encoding` method.
935    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
936        self.default_column_properties.set_dictionary_enabled(value);
937        self
938    }
939
940    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
941    /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
942    ///
943    /// The parquet writer will attempt to limit the size of each
944    /// `DataPage` used to store dictionaries to this many
945    /// bytes. Reducing this value will result in larger parquet
946    /// files, but may improve the effectiveness of page index based
947    /// predicate pushdown during reading.
948    ///
949    /// Note: this is a best effort limit based on value of
950    /// [`set_write_batch_size`](Self::set_write_batch_size).
951    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
952        self.default_column_properties
953            .set_dictionary_page_size_limit(value);
954        self
955    }
956
957    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
958    /// via [`DEFAULT_PAGE_SIZE`]).
959    ///
960    /// The parquet writer will attempt to limit the sizes of each
961    /// `DataPage` to this many bytes. Reducing this value will result
962    /// in larger parquet files, but may improve the effectiveness of
963    /// page index based predicate pushdown during reading.
964    ///
965    /// Note: this is a best effort limit based on value of
966    /// [`set_write_batch_size`](Self::set_write_batch_size).
967    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
968        self.default_column_properties
969            .set_data_page_size_limit(value);
970        self
971    }
972
973    /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
974    /// [`DEFAULT_STATISTICS_ENABLED`]).
975    ///
976    /// [`Page`]: EnabledStatistics::Page
977    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
978        self.default_column_properties.set_statistics_enabled(value);
979        self
980    }
981
982    /// enable/disable writing [`Statistics`] in the page header
983    /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
984    ///
985    /// Only applicable if [`Page`] level statistics are gathered.
986    ///
987    /// Setting this value to `true` can greatly increase the size of the resulting Parquet
988    /// file while yielding very little added benefit. Most modern Parquet implementations
989    /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
990    /// those in the page header.
991    ///
992    /// # Note
993    ///
994    /// Prior to version 56.0.0, the `parquet` crate always wrote these
995    /// statistics (the equivalent of setting this option to `true`). This was
996    /// changed in 56.0.0 to follow the recommendation in the Parquet
997    /// specification. See [issue #7580] for more details.
998    ///
999    /// [`Statistics`]: crate::file::statistics::Statistics
1000    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
1001    /// [`Page`]: EnabledStatistics::Page
1002    /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
1003    pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
1004        self.default_column_properties
1005            .set_write_page_header_statistics(value);
1006        self
1007    }
1008
1009    /// Sets if bloom filter should be written for all columns (defaults to `false`).
1010    ///
1011    /// # Notes
1012    ///
1013    /// * If the bloom filter is enabled previously then it is a no-op.
1014    ///
1015    /// * If the bloom filter is not enabled, default values for ndv and fpp
1016    ///   value are used used. See [`set_bloom_filter_ndv`] and
1017    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
1018    ///
1019    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
1020    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
1021    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
1022        self.default_column_properties
1023            .set_bloom_filter_enabled(value);
1024        self
1025    }
1026
1027    /// Sets the default target bloom filter false positive probability (fpp)
1028    /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
1029    ///
1030    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
1031    /// been called.
1032    ///
1033    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1034    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
1035        self.default_column_properties.set_bloom_filter_fpp(value);
1036        self
1037    }
1038
1039    /// Sets default maximum expected number of distinct values (ndv) for bloom filter
1040    /// for all columns (defaults to [`DEFAULT_BLOOM_FILTER_NDV`]).
1041    ///
1042    /// The bloom filter is initially sized for this many distinct values at the
1043    /// configured FPP, then folded down after all values are inserted to achieve
1044    /// optimal size. A good heuristic is to set this to the expected number of rows
1045    /// in the row group.
1046    ///
1047    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
1048    /// been called.
1049    ///
1050    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1051    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
1052        self.default_column_properties.set_bloom_filter_ndv(value);
1053        self
1054    }
1055
1056    // ----------------------------------------------------------------------
1057    // Setters for a specific column
1058
1059    /// Helper method to get existing or new mutable reference of column properties.
1060    #[inline]
1061    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
1062        self.column_properties.entry(col).or_default()
1063    }
1064
1065    /// Sets encoding for a specific column.
1066    ///
1067    /// Takes precedence over [`Self::set_encoding`].
1068    ///
1069    /// If dictionary is not enabled, this is treated as a primary encoding for this
1070    /// column. In case when dictionary is enabled for this column, either through
1071    /// global defaults or explicitly, this value is considered to be a fallback
1072    /// encoding for this column.
1073    ///
1074    /// # Panics
1075    /// If user tries to set dictionary encoding here, regardless of dictionary
1076    /// encoding flag being set.
1077    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
1078        self.get_mut_props(col).set_encoding(value);
1079        self
1080    }
1081
1082    /// Sets compression codec for a specific column.
1083    ///
1084    /// Takes precedence over [`Self::set_compression`].
1085    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
1086        self.get_mut_props(col).set_compression(value);
1087        self
1088    }
1089
1090    /// Sets flag to enable/disable dictionary encoding for a specific column.
1091    ///
1092    /// Takes precedence over [`Self::set_dictionary_enabled`].
1093    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
1094        self.get_mut_props(col).set_dictionary_enabled(value);
1095        self
1096    }
1097
1098    /// Sets dictionary page size limit for a specific column.
1099    ///
1100    /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
1101    pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
1102        self.get_mut_props(col)
1103            .set_dictionary_page_size_limit(value);
1104        self
1105    }
1106
1107    /// Sets data page size limit for a specific column.
1108    ///
1109    /// Takes precedence over [`Self::set_data_page_size_limit`].
1110    pub fn set_column_data_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
1111        self.get_mut_props(col).set_data_page_size_limit(value);
1112        self
1113    }
1114
1115    /// Sets [`EnabledStatistics`] level for a specific column.
1116    ///
1117    /// Takes precedence over [`Self::set_statistics_enabled`].
1118    pub fn set_column_statistics_enabled(
1119        mut self,
1120        col: ColumnPath,
1121        value: EnabledStatistics,
1122    ) -> Self {
1123        self.get_mut_props(col).set_statistics_enabled(value);
1124        self
1125    }
1126
1127    /// Sets whether to write [`Statistics`] in the page header for a specific column.
1128    ///
1129    /// Takes precedence over [`Self::set_write_page_header_statistics`].
1130    ///
1131    /// [`Statistics`]: crate::file::statistics::Statistics
1132    pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
1133        self.get_mut_props(col)
1134            .set_write_page_header_statistics(value);
1135        self
1136    }
1137
1138    /// Sets whether a bloom filter should be written for a specific column.
1139    ///
1140    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
1141    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
1142        self.get_mut_props(col).set_bloom_filter_enabled(value);
1143        self
1144    }
1145
1146    /// Sets the false positive probability for bloom filter for a specific column.
1147    ///
1148    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
1149    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
1150        self.get_mut_props(col).set_bloom_filter_fpp(value);
1151        self
1152    }
1153
1154    /// Sets the number of distinct values for bloom filter for a specific column.
1155    ///
1156    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
1157    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
1158        self.get_mut_props(col).set_bloom_filter_ndv(value);
1159        self
1160    }
1161}
1162
1163impl From<WriterProperties> for WriterPropertiesBuilder {
1164    fn from(props: WriterProperties) -> Self {
1165        WriterPropertiesBuilder {
1166            data_page_row_count_limit: props.data_page_row_count_limit,
1167            write_batch_size: props.write_batch_size,
1168            max_row_group_row_count: props.max_row_group_row_count,
1169            max_row_group_bytes: props.max_row_group_bytes,
1170            bloom_filter_position: props.bloom_filter_position,
1171            writer_version: props.writer_version,
1172            created_by: props.created_by,
1173            offset_index_disabled: !matches!(
1174                props.offset_index_setting,
1175                OffsetIndexSetting::Enabled
1176            ),
1177            key_value_metadata: props.key_value_metadata,
1178            default_column_properties: props.default_column_properties,
1179            column_properties: props.column_properties,
1180            sorting_columns: props.sorting_columns,
1181            column_index_truncate_length: props.column_index_truncate_length,
1182            statistics_truncate_length: props.statistics_truncate_length,
1183            coerce_types: props.coerce_types,
1184            content_defined_chunking: props.content_defined_chunking,
1185            #[cfg(feature = "encryption")]
1186            file_encryption_properties: props.file_encryption_properties,
1187        }
1188    }
1189}
1190
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    ///
    /// Produces the smallest metadata, but readers get no min/max information
    /// for statistics-based pruning.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. In addition, this will enable the writing
    /// of the column index (the offset index is always written regardless of
    /// this setting). See [`ParquetColumnIndex`] for
    /// more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}
1221
1222impl FromStr for EnabledStatistics {
1223    type Err = String;
1224
1225    fn from_str(s: &str) -> Result<Self, Self::Err> {
1226        match s {
1227            "NONE" | "none" => Ok(EnabledStatistics::None),
1228            "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk),
1229            "PAGE" | "page" => Ok(EnabledStatistics::Page),
1230            _ => Err(format!("Invalid statistics arg: {s}")),
1231        }
1232    }
1233}
1234
impl Default for EnabledStatistics {
    /// Returns [`DEFAULT_STATISTICS_ENABLED`] ([`EnabledStatistics::Page`]).
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
1240
/// Controls the bloom filter to be computed by the writer.
///
/// The bloom filter is initially sized for `ndv` distinct values at the given `fpp`, then
/// automatically folded down after all values are inserted to achieve optimal size while
/// maintaining the target `fpp`. See [`Sbbf::fold_to_target_fpp`] for details on the
/// folding algorithm.
///
/// [`Sbbf::fold_to_target_fpp`]: crate::bloom_filter::Sbbf::fold_to_target_fpp
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability. This should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp: the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// This value also serves as the target FPP for bloom filter folding: after all values
    /// are inserted, the filter is folded down to the smallest size that still meets this FPP.
    pub fpp: f64,
    /// Maximum expected number of distinct values. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// When not explicitly set via the builder, this defaults to
    /// [`max_row_group_row_count`](WriterProperties::max_row_group_row_count) (resolved at
    /// build time). The bloom filter is initially sized for this many distinct values at the
    /// given `fpp`, then folded down after insertion to achieve optimal size. A good heuristic
    /// is to set this to the expected number of rows in the row group. If fewer distinct values
    /// are actually written, the filter will be automatically compacted via folding.
    ///
    /// Thus the only negative side of overestimating this value is that the bloom filter
    /// will use more memory during writing than necessary, but it will not affect the final
    /// bloom filter size on disk.
    ///
    /// If you wish to reduce memory usage during writing and are able to make a reasonable estimate
    /// of the number of distinct values in a row group, it is recommended to set this value explicitly
    /// rather than relying on the default dynamic sizing based on `max_row_group_row_count`.
    /// If you do set this value explicitly it is probably best to set it for each column
    /// individually via [`WriterPropertiesBuilder::set_column_bloom_filter_ndv`] rather than globally,
    /// since different columns may have different numbers of distinct values.
    pub ndv: u64,
}
1285
1286impl Default for BloomFilterProperties {
1287    fn default() -> Self {
1288        BloomFilterProperties {
1289            fpp: DEFAULT_BLOOM_FILTER_FPP,
1290            ndv: DEFAULT_BLOOM_FILTER_NDV,
1291        }
1292    }
1293}
1294
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    /// (Fallback) encoding for the column; dictionary encodings are rejected by `set_encoding`
    encoding: Option<Encoding>,
    /// Compression codec for the column
    codec: Option<Compression>,
    /// Best effort maximum data page size, in bytes
    data_page_size_limit: Option<usize>,
    /// Best effort maximum dictionary page size, in bytes
    dictionary_page_size_limit: Option<usize>,
    /// Whether dictionary encoding is enabled for the column
    dictionary_enabled: Option<bool>,
    /// Statistics level (none / chunk / page) to compute for the column
    statistics_enabled: Option<EnabledStatistics>,
    /// Whether statistics are additionally written into data page headers
    write_page_header_statistics: Option<bool>,
    /// bloom filter related properties; `None` means the bloom filter is disabled
    bloom_filter_properties: Option<BloomFilterProperties>,
    /// Whether the bloom filter NDV was explicitly set by the user
    bloom_filter_ndv_is_set: bool,
}
1313
1314impl ColumnProperties {
1315    /// Sets encoding for this column.
1316    ///
1317    /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1318    /// In case when dictionary is enabled for a column, this value is considered to
1319    /// be a fallback encoding.
1320    ///
1321    /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1322    /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1323    /// for a column.
1324    fn set_encoding(&mut self, value: Encoding) {
1325        if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1326            panic!("Dictionary encoding can not be used as fallback encoding");
1327        }
1328        self.encoding = Some(value);
1329    }
1330
1331    /// Sets compression codec for this column.
1332    fn set_compression(&mut self, value: Compression) {
1333        self.codec = Some(value);
1334    }
1335
1336    /// Sets data page size limit for this column.
1337    fn set_data_page_size_limit(&mut self, value: usize) {
1338        self.data_page_size_limit = Some(value);
1339    }
1340
1341    /// Sets whether dictionary encoding is enabled for this column.
1342    fn set_dictionary_enabled(&mut self, enabled: bool) {
1343        self.dictionary_enabled = Some(enabled);
1344    }
1345
1346    /// Sets dictionary page size limit for this column.
1347    fn set_dictionary_page_size_limit(&mut self, value: usize) {
1348        self.dictionary_page_size_limit = Some(value);
1349    }
1350
1351    /// Sets the statistics level for this column.
1352    fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1353        self.statistics_enabled = Some(enabled);
1354    }
1355
1356    /// Sets whether to write statistics in the page header for this column.
1357    fn set_write_page_header_statistics(&mut self, enabled: bool) {
1358        self.write_page_header_statistics = Some(enabled);
1359    }
1360
1361    /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1362    /// otherwise it is a no-op.
1363    /// If `value` is `false`, resets bloom filter properties to `None`.
1364    fn set_bloom_filter_enabled(&mut self, value: bool) {
1365        if value && self.bloom_filter_properties.is_none() {
1366            self.bloom_filter_properties = Some(Default::default())
1367        } else if !value {
1368            self.bloom_filter_properties = None
1369        }
1370    }
1371
1372    /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1373    /// bloom filter if not previously enabled.
1374    ///
1375    /// # Panics
1376    ///
1377    /// Panics if the `value` is not between 0 and 1 exclusive
1378    fn set_bloom_filter_fpp(&mut self, value: f64) {
1379        assert!(
1380            value > 0. && value < 1.0,
1381            "fpp must be between 0 and 1 exclusive, got {value}"
1382        );
1383
1384        self.bloom_filter_properties
1385            .get_or_insert_with(Default::default)
1386            .fpp = value;
1387    }
1388
1389    /// Sets the maximum expected number of distinct (unique) values for bloom filter for this
1390    /// column, and implicitly enables bloom filter if not previously enabled.
1391    fn set_bloom_filter_ndv(&mut self, value: u64) {
1392        self.bloom_filter_properties
1393            .get_or_insert_with(Default::default)
1394            .ndv = value;
1395        self.bloom_filter_ndv_is_set = true;
1396    }
1397
1398    /// Returns optional encoding for this column.
1399    fn encoding(&self) -> Option<Encoding> {
1400        self.encoding
1401    }
1402
1403    /// Returns optional compression codec for this column.
1404    fn compression(&self) -> Option<Compression> {
1405        self.codec
1406    }
1407
1408    /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1409    /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1410    /// been provided.
1411    fn dictionary_enabled(&self) -> Option<bool> {
1412        self.dictionary_enabled
1413    }
1414
1415    /// Returns optional dictionary page size limit for this column.
1416    fn dictionary_page_size_limit(&self) -> Option<usize> {
1417        self.dictionary_page_size_limit
1418    }
1419
1420    /// Returns optional data page size limit for this column.
1421    fn data_page_size_limit(&self) -> Option<usize> {
1422        self.data_page_size_limit
1423    }
1424
    /// Returns optional statistics level requested for this column. If result is `None`,
    /// then no setting has been provided.
    ///
    /// See [`EnabledStatistics`] for the available levels.
    fn statistics_enabled(&self) -> Option<EnabledStatistics> {
        self.statistics_enabled
    }
1430
    /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
    /// column.
    ///
    /// `None` means no column-specific setting has been provided.
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    fn write_page_header_statistics(&self) -> Option<bool> {
        self.write_page_header_statistics
    }
1438
    /// Returns the bloom filter properties, or `None` if not enabled.
    ///
    /// The properties are created lazily by the `set_bloom_filter_*` setters.
    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
        self.bloom_filter_properties.as_ref()
    }
1443
1444    /// If bloom filter is enabled and NDV was not explicitly set, resolve it to the
1445    /// given `default_ndv` (typically derived from `max_row_group_row_count`).
1446    fn resolve_bloom_filter_ndv(&mut self, default_ndv: u64) {
1447        if !self.bloom_filter_ndv_is_set {
1448            if let Some(ref mut bf) = self.bloom_filter_properties {
1449                bf.ndv = default_ndv;
1450            }
1451        }
1452    }
1453}
1454
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

// Defaults applied by `ReaderPropertiesBuilder::build` when the corresponding
// option was never set explicitly.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
const DEFAULT_READ_PAGE_STATS: bool = false;
1460
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    // Codec options (e.g. backward compatible LZ4 handling) passed to codecs.
    codec_options: CodecOptions,
    // Whether bloom filters are read from the file; `false` by default.
    read_bloom_filter: bool,
    // Whether page-level statistics are decoded; `false` by default.
    read_page_stats: bool,
}
1484
impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether to read bloom filter (`false` by default).
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }

    /// Returns whether to read page level statistics (`false` by default).
    pub(crate) fn read_page_stats(&self) -> bool {
        self.read_page_stats
    }
}
1506
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    codec_options_builder: CodecOptionsBuilder,
    // `None` = not set; `build()` falls back to `DEFAULT_READ_BLOOM_FILTER`.
    read_bloom_filter: Option<bool>,
    // `None` = not set; `build()` falls back to `DEFAULT_READ_PAGE_STATS`.
    read_page_stats: Option<bool>,
}
1514
1515/// Reader properties builder.
1516impl ReaderPropertiesBuilder {
1517    /// Returns default state of the builder.
1518    fn with_defaults() -> Self {
1519        Self {
1520            codec_options_builder: CodecOptionsBuilder::default(),
1521            read_bloom_filter: None,
1522            read_page_stats: None,
1523        }
1524    }
1525
1526    /// Finalizes the configuration and returns immutable reader properties struct.
1527    pub fn build(self) -> ReaderProperties {
1528        ReaderProperties {
1529            codec_options: self.codec_options_builder.build(),
1530            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1531            read_page_stats: self.read_page_stats.unwrap_or(DEFAULT_READ_PAGE_STATS),
1532        }
1533    }
1534
1535    /// Enable/disable backward compatible LZ4.
1536    ///
1537    /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1538    /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1539    /// with files generated by older versions of this library, and LZ4_RAW, for backward
1540    /// compatibility with files generated by older versions of parquet-cpp.
1541    ///
1542    /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1543    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1544        self.codec_options_builder = self
1545            .codec_options_builder
1546            .set_backward_compatible_lz4(value);
1547        self
1548    }
1549
1550    /// Enable/disable reading bloom filter
1551    ///
1552    /// If reading bloom filter is enabled, bloom filter will be read from the file.
1553    /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1554    ///
1555    /// By default bloom filter is set to be read.
1556    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1557        self.read_bloom_filter = Some(value);
1558        self
1559    }
1560
1561    /// Enable/disable reading page-level statistics
1562    ///
1563    /// If set to `true`, then the reader will decode and populate the [`Statistics`] for
1564    /// each page, if present.
1565    /// If set to `false`, then the reader will skip decoding the statistics.
1566    ///
1567    /// By default statistics will not be decoded.
1568    ///
1569    /// [`Statistics`]: crate::file::statistics::Statistics
1570    pub fn set_read_page_statistics(mut self, value: bool) -> Self {
1571        self.read_page_stats = Some(value);
1572        self
1573    }
1574}
1575
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(
            props.max_row_group_row_count(),
            Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT)
        );
        assert_eq!(props.max_row_group_bytes(), None);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        assert!(
            props
                .bloom_filter_properties(&ColumnPath::from("col"))
                .is_none()
        );
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_row_count(Some(40))
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        fn test_props(props: &WriterProperties) {
            assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
            assert_eq!(props.data_page_size_limit(), 10);
            assert_eq!(props.dictionary_page_size_limit(), 20);
            assert_eq!(props.write_batch_size(), 30);
            assert_eq!(props.max_row_group_row_count(), Some(40));
            assert_eq!(props.created_by(), "default");
            assert_eq!(
                props.key_value_metadata(),
                Some(&vec![
                    KeyValue::new("key".to_string(), "value".to_string(),)
                ])
            );

            assert_eq!(
                props.encoding(&ColumnPath::from("a")),
                Some(Encoding::DELTA_BINARY_PACKED)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("a")),
                Compression::GZIP(Default::default())
            );
            assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("a")),
                EnabledStatistics::None
            );

            assert_eq!(
                props.encoding(&ColumnPath::from("col")),
                Some(Encoding::RLE)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("col")),
                Compression::SNAPPY
            );
            assert!(props.dictionary_enabled(&ColumnPath::from("col")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("col")),
                EnabledStatistics::Chunk
            );
            assert_eq!(
                props.bloom_filter_properties(&ColumnPath::from("col")),
                Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
            );
        }

        // Test direct build of properties
        test_props(&props);

        // Test that into_builder() gives the same result
        let props_into_builder_and_back = props.into_builder().build();
        test_props(&props_into_builder_and_back);
    }

    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: DEFAULT_BLOOM_FILTER_FPP,
                ndv: DEFAULT_BLOOM_FILTER_NDV,
            })
        );
    }

    #[test]
    #[allow(deprecated)]
    fn test_writer_properties_deprecated_max_row_group_size_still_works() {
        let props = WriterProperties::builder()
            .set_max_row_group_size(42)
            .build();

        assert_eq!(props.max_row_group_row_count(), Some(42));
        assert_eq!(props.max_row_group_size(), 42);
    }

    #[test]
    #[should_panic(expected = "Cannot have a 0 max row group row count")]
    fn test_writer_properties_panic_on_zero_row_group_row_count() {
        let _ = WriterProperties::builder().set_max_row_group_row_count(Some(0));
    }

    #[test]
    #[should_panic(expected = "Cannot have a 0 max row group bytes")]
    fn test_writer_properties_panic_on_zero_row_group_bytes() {
        let _ = WriterProperties::builder().set_max_row_group_bytes(Some(0));
    }

    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: DEFAULT_BLOOM_FILTER_FPP,
                ndv: 100,
            })
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: DEFAULT_BLOOM_FILTER_NDV,
            })
        );
    }

    #[test]
    fn test_writer_properties_column_dictionary_page_size_limit() {
        let props = WriterProperties::builder()
            .set_dictionary_page_size_limit(100)
            .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
            .build();

        assert_eq!(props.dictionary_page_size_limit(), 100);
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
            10
        );
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
            100
        );
    }

    #[test]
    fn test_writer_properties_column_data_page_size_limit() {
        let props = WriterProperties::builder()
            .set_data_page_size_limit(100)
            .set_column_data_page_size_limit(ColumnPath::from("col"), 10)
            .build();

        assert_eq!(props.data_page_size_limit(), 100);
        assert_eq!(
            props.column_data_page_size_limit(&ColumnPath::from("col")),
            10
        );
        assert_eq!(
            props.column_data_page_size_limit(&ColumnPath::from("other")),
            100
        );
    }

    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(!props.read_bloom_filter());
        // Page-level statistics are also disabled by default.
        assert!(!props.read_page_stats());
    }

    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .set_read_bloom_filter(true)
            .set_read_page_statistics(true)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        // Explicitly-set options must override the `false` defaults.
        assert!(props.read_bloom_filter());
        assert!(props.read_page_stats());
    }

    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);

        //test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }

    #[test]
    fn test_cdc_options_equality() {
        let opts = CdcOptions::default();
        assert_eq!(opts, CdcOptions::default());

        let custom = CdcOptions {
            min_chunk_size: 1024,
            max_chunk_size: 8192,
            norm_level: 1,
        };
        assert_eq!(custom, custom);
        assert_ne!(opts, custom);
    }
}
1959}