Skip to main content

parquet/file/
properties.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::errors::{ParquetError, Result};
24use crate::file::metadata::{KeyValue, SortingColumn};
25use crate::schema::types::ColumnPath;
26use std::str::FromStr;
27use std::{collections::HashMap, sync::Arc};
28
/// Default value for [`WriterProperties::data_page_size_limit`] (1 MiB)
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
/// (same as [`DEFAULT_PAGE_SIZE`])
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_row_count`]
pub const DEFAULT_MAX_ROW_GROUP_ROW_COUNT: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp()`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv()`].
///
/// Note: this is only the fallback default used when constructing [`BloomFilterProperties`]
/// directly. When using [`WriterPropertiesBuilder`], columns with bloom filters enabled
/// but without an explicit NDV will have their NDV resolved at build time to
/// [`WriterProperties::max_row_group_row_count`], which may differ from this constant
/// if the user configured a custom row group size.
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = DEFAULT_MAX_ROW_GROUP_ROW_COUNT as u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
/// Default value for [`WriterProperties::data_page_v2_compression_ratio_threshold`]
pub const DEFAULT_DATA_PAGE_V2_COMPRESSION_RATIO_THRESHOLD: f64 = 1.0;
/// Default value for [`WriterProperties::write_path_in_schema`]
pub const DEFAULT_WRITE_PATH_IN_SCHEMA: bool = true;
/// Default minimum chunk size for content-defined chunking: 256 KiB.
pub const DEFAULT_CDC_MIN_CHUNK_SIZE: usize = 256 * 1024;
/// Default maximum chunk size for content-defined chunking: 1024 KiB.
pub const DEFAULT_CDC_MAX_CHUNK_SIZE: usize = 1024 * 1024;
/// Default normalization level for content-defined chunking.
pub const DEFAULT_CDC_NORM_LEVEL: i32 = 0;
81
/// EXPERIMENTAL: Options for content-defined chunking (CDC).
///
/// Content-defined chunking is an experimental feature that optimizes parquet
/// files for content addressable storage (CAS) systems by writing data pages
/// according to content-defined chunk boundaries. This allows for more
/// efficient deduplication of data across files, hence more efficient network
/// transfers and storage.
///
/// Each content-defined chunk is written as a separate parquet data page. The
/// following options control the chunks' size and the chunking process. Note
/// that the chunk size is calculated based on the logical value of the data,
/// before any encoding or compression is applied.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CdcOptions {
    /// Minimum chunk size in bytes, default is 256 KiB.
    ///
    /// The rolling hash will not be updated until this size is reached for each chunk.
    /// Note that all data sent through the hash function is counted towards the chunk
    /// size, including definition and repetition levels if present.
    pub min_chunk_size: usize,
    /// Maximum chunk size in bytes, default is 1024 KiB.
    ///
    /// The chunker will create a new chunk whenever the chunk size exceeds this value.
    /// Note that the parquet writer has a related [`data_page_size_limit`] property that
    /// controls the maximum size of a parquet data page after encoding. While setting
    /// `data_page_size_limit` to a smaller value than `max_chunk_size` doesn't affect
    /// the chunking effectiveness, it results in more small parquet data pages.
    ///
    /// [`data_page_size_limit`]: WriterPropertiesBuilder::set_data_page_size_limit
    pub max_chunk_size: usize,
    /// Number of bits by which the gearhash mask is adjusted in order to center the
    /// chunk size around the average size more aggressively, default is 0.
    ///
    /// Increasing the normalization level increases the probability of finding a chunk,
    /// improving the deduplication ratio, but also increasing the number of small chunks
    /// resulting in many small parquet data pages. The default value provides a good
    /// balance between deduplication ratio and fragmentation.
    ///
    /// Use `norm_level = 1` or `norm_level = 2` to reach a higher deduplication ratio at
    /// the expense of fragmentation. Negative values can also be used to reduce the
    /// probability of finding a chunk, resulting in larger chunks and fewer data pages.
    /// Note that values outside `[-3, 3]` are not recommended; prefer using the default
    /// value of 0 for most use cases.
    pub norm_level: i32,
}
123
124impl Default for CdcOptions {
125    fn default() -> Self {
126        Self {
127            min_chunk_size: DEFAULT_CDC_MIN_CHUNK_SIZE,
128            max_chunk_size: DEFAULT_CDC_MAX_CHUNK_SIZE,
129            norm_level: DEFAULT_CDC_NORM_LEVEL,
130        }
131    }
132}
133
/// Version of the Parquet format a writer targets.
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns the numeric form of the writer version: `1` for
    /// [`WriterVersion::PARQUET_1_0`], `2` for [`WriterVersion::PARQUET_2_0`].
    pub fn as_num(&self) -> i32 {
        match *self {
            Self::PARQUET_1_0 => 1,
            Self::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a writer version from its variant name.
    ///
    /// Only the all-uppercase and all-lowercase spellings are accepted
    /// (e.g. `"PARQUET_1_0"` or `"parquet_1_0"`); anything else yields an
    /// `Err` carrying a message that echoes the rejected input.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let version = match s {
            "PARQUET_1_0" | "parquet_1_0" => Self::PARQUET_1_0,
            "PARQUET_2_0" | "parquet_2_0" => Self::PARQUET_2_0,
            _ => return Err(format!("Invalid writer version: {s}")),
        };
        Ok(version)
    }
}
167
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write the Bloom filters of each row group right after that row group.
    ///
    /// This saves memory by writing each filter as soon as it is computed, at the
    /// cost of data locality for readers.
    AfterRowGroup,
    /// Write all Bloom filters at the end of the file.
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
185
/// Reference counted writer properties: an [`Arc`] around [`WriterProperties`].
pub type WriterPropertiesPtr = Arc<WriterProperties>;
188
/// Resolved state of [`WriterPropertiesBuilder::set_offset_index_disabled`].
///
/// When a user disables offset indexes but page-level statistics are enabled,
/// the setting is overridden (offset indexes remain enabled). This enum
/// preserves the user's original intent so that a round-trip through
/// `WriterPropertiesBuilder` does not lose it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OffsetIndexSetting {
    /// Offset indexes are enabled (the default).
    Enabled,
    /// The user disabled offset indexes and no page-level statistics override
    /// that choice, so offset indexes are not written.
    Disabled,
    /// The user disabled offset indexes, but page-level statistics require them,
    /// so they remain enabled.
    DisabledOverridden,
}
205
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    /// Best effort maximum number of rows in a data page.
    data_page_row_count_limit: usize,
    /// Batch size used when writing column data.
    write_batch_size: usize,
    /// Maximum number of rows in a row group, `None` if unlimited.
    max_row_group_row_count: Option<usize>,
    /// Maximum size of a row group in bytes, `None` if unlimited.
    max_row_group_bytes: Option<usize>,
    /// Where Bloom filters are written in the file.
    bloom_filter_position: BloomFilterPosition,
    /// Parquet writer version.
    writer_version: WriterVersion,
    /// The `created_by` string.
    created_by: String,
    /// Resolved offset index state, computed by [`WriterPropertiesBuilder::build`].
    offset_index_setting: OffsetIndexSetting,
    /// Optional `key_value_metadata` pairs.
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings consulted when a column has no (or an unset) specific override.
    default_column_properties: ColumnProperties,
    /// Column-specific settings, keyed by column path; these take precedence over
    /// `default_column_properties`.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Optional sorting columns.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Maximum length of truncated min/max values in the column index,
    /// `None` if truncation is disabled.
    column_index_truncate_length: Option<usize>,
    /// Maximum length of truncated min/max values in statistics,
    /// `None` if truncation is disabled.
    statistics_truncate_length: Option<usize>,
    /// Whether type coercion is enabled.
    coerce_types: bool,
    /// EXPERIMENTAL: content-defined chunking options, `None` if CDC is disabled.
    content_defined_chunking: Option<CdcOptions>,
    /// Whether the `path_in_schema` field of the `ColumnMetaData` Thrift struct
    /// should be written.
    write_path_in_schema: bool,
    /// Optional file encryption properties (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
262
263impl Default for WriterProperties {
264    fn default() -> Self {
265        Self::builder().build()
266    }
267}
268
269impl WriterProperties {
270    /// Create a new [`WriterProperties`] with the default settings
271    ///
272    /// See [`WriterProperties::builder`] for customising settings
273    pub fn new() -> Self {
274        Self::default()
275    }
276
277    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
278    /// properties.
279    pub fn builder() -> WriterPropertiesBuilder {
280        WriterPropertiesBuilder::default()
281    }
282
283    /// Converts this [`WriterProperties`] into a [`WriterPropertiesBuilder`]
284    /// Used for mutating existing property settings
285    pub fn into_builder(self) -> WriterPropertiesBuilder {
286        self.into()
287    }
288
289    /// Returns data page size limit.
290    ///
291    /// Note: this is a best effort limit based on the write batch size
292    ///
293    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
294    pub fn data_page_size_limit(&self) -> usize {
295        self.default_column_properties
296            .data_page_size_limit()
297            .unwrap_or(DEFAULT_PAGE_SIZE)
298    }
299
300    /// Returns data page size limit for a specific column.
301    ///
302    /// Takes precedence over [`Self::data_page_size_limit`].
303    ///
304    /// Note: this is a best effort limit based on the write batch size.
305    pub fn column_data_page_size_limit(&self, col: &ColumnPath) -> usize {
306        self.column_properties
307            .get(col)
308            .and_then(|c| c.data_page_size_limit())
309            .or_else(|| self.default_column_properties.data_page_size_limit())
310            .unwrap_or(DEFAULT_PAGE_SIZE)
311    }
312
313    /// Returns dictionary page size limit.
314    ///
315    /// Note: this is a best effort limit based on the write batch size
316    ///
317    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
318    pub fn dictionary_page_size_limit(&self) -> usize {
319        self.default_column_properties
320            .dictionary_page_size_limit()
321            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
322    }
323
324    /// Returns dictionary page size limit for a specific column.
325    pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
326        self.column_properties
327            .get(col)
328            .and_then(|c| c.dictionary_page_size_limit())
329            .or_else(|| self.default_column_properties.dictionary_page_size_limit())
330            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
331    }
332
333    /// Returns the maximum page row count
334    ///
335    /// Note: this is a best effort limit based on the write batch size
336    ///
337    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
338    pub fn data_page_row_count_limit(&self) -> usize {
339        self.data_page_row_count_limit
340    }
341
342    /// Returns configured batch size for writes.
343    ///
344    /// When writing a batch of data, this setting allows to split it internally into
345    /// smaller batches so we can better estimate the size of a page currently being
346    /// written.
347    ///
348    /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
349    pub fn write_batch_size(&self) -> usize {
350        self.write_batch_size
351    }
352
353    /// Returns maximum number of rows in a row group, or `usize::MAX` if unlimited.
354    ///
355    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
356    #[deprecated(since = "58.0.0", note = "Use `max_row_group_row_count` instead")]
357    pub fn max_row_group_size(&self) -> usize {
358        self.max_row_group_row_count.unwrap_or(usize::MAX)
359    }
360
361    /// Returns maximum number of rows in a row group, or `None` if unlimited.
362    ///
363    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_row_count`]
364    pub fn max_row_group_row_count(&self) -> Option<usize> {
365        self.max_row_group_row_count
366    }
367
368    /// Returns maximum size of a row group in bytes, or `None` if unlimited.
369    ///
370    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_bytes`]
371    pub fn max_row_group_bytes(&self) -> Option<usize> {
372        self.max_row_group_bytes
373    }
374
375    /// Returns bloom filter position.
376    ///
377    /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
378    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
379        self.bloom_filter_position
380    }
381
382    /// Returns configured writer version.
383    ///
384    /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
385    pub fn writer_version(&self) -> WriterVersion {
386        self.writer_version
387    }
388
389    /// Returns `created_by` string.
390    ///
391    /// For more details see [`WriterPropertiesBuilder::set_created_by`]
392    pub fn created_by(&self) -> &str {
393        &self.created_by
394    }
395
396    /// Returns `true` if offset index writing is disabled.
397    ///
398    /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
399    pub fn offset_index_disabled(&self) -> bool {
400        matches!(self.offset_index_setting, OffsetIndexSetting::Disabled)
401    }
402
403    /// Returns `key_value_metadata` KeyValue pairs.
404    ///
405    /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
406    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
407        self.key_value_metadata.as_ref()
408    }
409
410    /// Returns sorting columns.
411    ///
412    /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
413    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
414        self.sorting_columns.as_ref()
415    }
416
417    /// Returns the maximum length of truncated min/max values in the column index.
418    ///
419    /// `None` if truncation is disabled, must be greater than 0 otherwise.
420    ///
421    /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
422    pub fn column_index_truncate_length(&self) -> Option<usize> {
423        self.column_index_truncate_length
424    }
425
426    /// Returns the maximum length of truncated min/max values in [`Statistics`].
427    ///
428    /// `None` if truncation is disabled, must be greater than 0 otherwise.
429    ///
430    /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
431    ///
432    /// [`Statistics`]: crate::file::statistics::Statistics
433    pub fn statistics_truncate_length(&self) -> Option<usize> {
434        self.statistics_truncate_length
435    }
436
437    /// Returns `true` if type coercion is enabled.
438    ///
439    /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
440    pub fn coerce_types(&self) -> bool {
441        self.coerce_types
442    }
443
444    /// Returns `true` if the `path_in_schema` field of the `ColumnMetaData` Thrift struct
445    /// should be written.
446    ///
447    /// For more details see [`WriterPropertiesBuilder::set_write_path_in_schema`]
448    pub fn write_path_in_schema(&self) -> bool {
449        self.write_path_in_schema
450    }
451
452    /// EXPERIMENTAL: Returns content-defined chunking options, or `None` if CDC is disabled.
453    ///
454    /// For more details see [`WriterPropertiesBuilder::set_content_defined_chunking`]
455    pub fn content_defined_chunking(&self) -> Option<&CdcOptions> {
456        self.content_defined_chunking.as_ref()
457    }
458
459    /// Returns the compression ratio threshold at or above which a Data Page v2's
460    /// compressed values are discarded in favor of writing the values uncompressed.
461    ///
462    /// For more details see [`WriterPropertiesBuilder::set_data_page_v2_compression_ratio_threshold`]
463    pub fn data_page_v2_compression_ratio_threshold(&self) -> f64 {
464        self.default_column_properties
465            .data_page_v2_compression_ratio_threshold()
466            .unwrap_or(DEFAULT_DATA_PAGE_V2_COMPRESSION_RATIO_THRESHOLD)
467    }
468
469    /// Returns the Data Page v2 compression ratio threshold for a specific column.
470    ///
471    /// Takes precedence over [`Self::data_page_v2_compression_ratio_threshold`].
472    pub fn column_data_page_v2_compression_ratio_threshold(&self, col: &ColumnPath) -> f64 {
473        self.column_properties
474            .get(col)
475            .and_then(|c| c.data_page_v2_compression_ratio_threshold())
476            .or_else(|| {
477                self.default_column_properties
478                    .data_page_v2_compression_ratio_threshold()
479            })
480            .unwrap_or(DEFAULT_DATA_PAGE_V2_COMPRESSION_RATIO_THRESHOLD)
481    }
482
483    /// Returns encoding for a data page, when dictionary encoding is enabled.
484    ///
485    /// This is not configurable.
486    #[inline]
487    pub fn dictionary_data_page_encoding(&self) -> Encoding {
488        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
489        // Dictionary values are encoded using RLE_DICTIONARY encoding.
490        Encoding::RLE_DICTIONARY
491    }
492
493    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
494    ///
495    /// This is not configurable.
496    #[inline]
497    pub fn dictionary_page_encoding(&self) -> Encoding {
498        // PLAIN_DICTIONARY is deprecated in writer version 1.
499        // Dictionary is encoded using plain encoding.
500        Encoding::PLAIN
501    }
502
503    /// Returns encoding for a column, if set.
504    ///
505    /// In case when dictionary is enabled, returns fallback encoding.
506    ///
507    /// If encoding is not set, then column writer will choose the best encoding
508    /// based on the column type.
509    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
510        self.column_properties
511            .get(col)
512            .and_then(|c| c.encoding())
513            .or_else(|| self.default_column_properties.encoding())
514    }
515
516    /// Returns compression codec for a column.
517    ///
518    /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
519    pub fn compression(&self, col: &ColumnPath) -> Compression {
520        self.column_properties
521            .get(col)
522            .and_then(|c| c.compression())
523            .or_else(|| self.default_column_properties.compression())
524            .unwrap_or(DEFAULT_COMPRESSION)
525    }
526
527    /// Returns `true` if dictionary encoding is enabled for a column.
528    ///
529    /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
530    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
531        self.column_properties
532            .get(col)
533            .and_then(|c| c.dictionary_enabled())
534            .or_else(|| self.default_column_properties.dictionary_enabled())
535            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
536    }
537
538    /// Returns which statistics are written for a column.
539    ///
540    /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
541    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
542        self.column_properties
543            .get(col)
544            .and_then(|c| c.statistics_enabled())
545            .or_else(|| self.default_column_properties.statistics_enabled())
546            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
547    }
548
549    /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
550    ///
551    /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
552    ///
553    /// [`Statistics`]: crate::file::statistics::Statistics
554    pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
555        self.column_properties
556            .get(col)
557            .and_then(|c| c.write_page_header_statistics())
558            .or_else(|| {
559                self.default_column_properties
560                    .write_page_header_statistics()
561            })
562            .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
563    }
564
565    /// Returns the [`BloomFilterProperties`] for the given column
566    ///
567    /// Returns `None` if bloom filter is disabled
568    ///
569    /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
570    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
571        self.column_properties
572            .get(col)
573            .and_then(|c| c.bloom_filter_properties())
574            .or_else(|| self.default_column_properties.bloom_filter_properties())
575    }
576
577    /// Return file encryption properties
578    ///
579    /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
580    #[cfg(feature = "encryption")]
581    pub fn file_encryption_properties(&self) -> Option<&Arc<FileEncryptionProperties>> {
582        self.file_encryption_properties.as_ref()
583    }
584}
585
/// Builder for [`WriterProperties`] Parquet writer configuration.
///
/// See example on [`WriterProperties`]
#[derive(Debug, Clone)]
pub struct WriterPropertiesBuilder {
    /// Best effort maximum number of rows in a data page.
    data_page_row_count_limit: usize,
    /// Batch size used when writing column data.
    write_batch_size: usize,
    /// Maximum number of rows in a row group, `None` for unlimited.
    max_row_group_row_count: Option<usize>,
    /// Maximum size of a row group in bytes, `None` for unlimited.
    max_row_group_bytes: Option<usize>,
    /// Where Bloom filters are written in the file.
    bloom_filter_position: BloomFilterPosition,
    /// Parquet writer version.
    writer_version: WriterVersion,
    /// The `created_by` string.
    created_by: String,
    /// The user's request to disable offset indexes; resolved against the
    /// statistics settings when the properties are built.
    offset_index_disabled: bool,
    /// Optional `key_value_metadata` pairs.
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings used for columns without a specific override.
    default_column_properties: ColumnProperties,
    /// Column-specific settings, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Optional sorting columns.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Maximum length of truncated min/max values in the column index.
    column_index_truncate_length: Option<usize>,
    /// Maximum length of truncated min/max values in statistics.
    statistics_truncate_length: Option<usize>,
    /// Whether type coercion is enabled.
    coerce_types: bool,
    /// EXPERIMENTAL: content-defined chunking options, `None` when CDC is disabled.
    content_defined_chunking: Option<CdcOptions>,
    /// Whether `path_in_schema` should be written.
    write_path_in_schema: bool,
    /// Optional file encryption properties (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
611
612impl Default for WriterPropertiesBuilder {
613    /// Returns default state of the builder.
614    fn default() -> Self {
615        Self {
616            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
617            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
618            max_row_group_row_count: Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT),
619            max_row_group_bytes: None,
620            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
621            writer_version: DEFAULT_WRITER_VERSION,
622            created_by: DEFAULT_CREATED_BY.to_string(),
623            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
624            key_value_metadata: None,
625            default_column_properties: Default::default(),
626            column_properties: HashMap::new(),
627            sorting_columns: None,
628            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
629            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
630            coerce_types: DEFAULT_COERCE_TYPES,
631            content_defined_chunking: None,
632            write_path_in_schema: DEFAULT_WRITE_PATH_IN_SCHEMA,
633            #[cfg(feature = "encryption")]
634            file_encryption_properties: None,
635        }
636    }
637}
638
639impl WriterPropertiesBuilder {
640    /// Finalizes the configuration and returns immutable writer properties struct.
641    pub fn build(self) -> WriterProperties {
642        // Pre-compute offset_index_setting
643        let offset_index_setting = if self.offset_index_disabled {
644            let default_page_stats_enabled = self.default_column_properties.statistics_enabled()
645                == Some(EnabledStatistics::Page);
646            let column_page_stats_enabled = self.column_properties.iter().any(|path_props| {
647                path_props.1.statistics_enabled() == Some(EnabledStatistics::Page)
648            });
649            if default_page_stats_enabled || column_page_stats_enabled {
650                OffsetIndexSetting::DisabledOverridden
651            } else {
652                OffsetIndexSetting::Disabled
653            }
654        } else {
655            OffsetIndexSetting::Enabled
656        };
657
658        // Resolve bloom filter NDV for columns where it wasn't explicitly set:
659        // default to max_row_group_row_count so the filter is never undersized.
660        let default_ndv = self
661            .max_row_group_row_count
662            .unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT) as u64;
663        let mut default_column_properties = self.default_column_properties;
664        default_column_properties.resolve_bloom_filter_ndv(default_ndv);
665        let mut column_properties = self.column_properties;
666        for props in column_properties.values_mut() {
667            props.resolve_bloom_filter_ndv(default_ndv);
668        }
669
670        WriterProperties {
671            data_page_row_count_limit: self.data_page_row_count_limit,
672            write_batch_size: self.write_batch_size,
673            max_row_group_row_count: self.max_row_group_row_count,
674            max_row_group_bytes: self.max_row_group_bytes,
675            bloom_filter_position: self.bloom_filter_position,
676            writer_version: self.writer_version,
677            created_by: self.created_by,
678            offset_index_setting,
679            key_value_metadata: self.key_value_metadata,
680            default_column_properties,
681            column_properties,
682            sorting_columns: self.sorting_columns,
683            column_index_truncate_length: self.column_index_truncate_length,
684            statistics_truncate_length: self.statistics_truncate_length,
685            coerce_types: self.coerce_types,
686            content_defined_chunking: self.content_defined_chunking,
687            write_path_in_schema: self.write_path_in_schema,
688            #[cfg(feature = "encryption")]
689            file_encryption_properties: self.file_encryption_properties,
690        }
691    }
692
693    // ----------------------------------------------------------------------
694    // Writer properties related to a file
695
696    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
697    /// via [`DEFAULT_WRITER_VERSION`])
698    ///
699    /// This value can determine what features some readers will support.
700    ///
701    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
702    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
703        self.writer_version = value;
704        self
705    }
706
707    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
708    /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
709    ///
710    /// The parquet writer will attempt to limit the number of rows in
711    /// each `DataPage` to this value. Reducing this value will result
712    /// in larger parquet files, but may improve the effectiveness of
713    /// page index based predicate pushdown during reading.
714    ///
715    /// Note: this is a best effort limit based on value of
716    /// [`set_write_batch_size`](Self::set_write_batch_size).
717    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
718        self.data_page_row_count_limit = value;
719        self
720    }
721
722    /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
723    ///
724    /// For performance reasons, data for each column is written in
725    /// batches of this size.
726    ///
727    /// Additional limits such as such as
728    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
729    /// are checked between batches, and thus the write batch size value acts as an
730    /// upper-bound on the enforcement granularity of other limits.
731    pub fn set_write_batch_size(mut self, value: usize) -> Self {
732        self.write_batch_size = value;
733        self
734    }
735
736    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
737    /// via [`DEFAULT_MAX_ROW_GROUP_ROW_COUNT`]).
738    ///
739    /// # Panics
740    /// If the value is set to 0.
741    #[deprecated(since = "58.0.0", note = "Use `set_max_row_group_row_count` instead")]
742    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
743        assert!(value > 0, "Cannot have a 0 max row group size");
744        self.max_row_group_row_count = Some(value);
745        self
746    }
747
748    /// Sets maximum number of rows in a row group, or `None` for unlimited.
749    ///
750    /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
751    /// the row group with the smaller limit will be produced.
752    ///
753    /// # Panics
754    /// If the value is `Some(0)`.
755    pub fn set_max_row_group_row_count(mut self, value: Option<usize>) -> Self {
756        assert_ne!(value, Some(0), "Cannot have a 0 max row group row count");
757        self.max_row_group_row_count = value;
758        self
759    }
760
761    /// Sets maximum size of a row group in bytes, or `None` for unlimited.
762    ///
763    /// Row groups are flushed when their estimated encoded size exceeds this threshold.
764    /// This is similar to the official Java implementation for `parquet.block.size`'s behavior.
765    ///
766    /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
767    /// the row group with the smaller limit will be produced.
768    ///
769    /// # Panics
770    /// If the value is `Some(0)`.
771    pub fn set_max_row_group_bytes(mut self, value: Option<usize>) -> Self {
772        assert_ne!(value, Some(0), "Cannot have a 0 max row group bytes");
773        self.max_row_group_bytes = value;
774        self
775    }
776
777    /// Sets where in the final file Bloom Filters are written (defaults to  [`AfterRowGroup`]
778    /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
779    ///
780    /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
781    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
782        self.bloom_filter_position = value;
783        self
784    }
785
786    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
787    /// [`DEFAULT_CREATED_BY`]).
788    ///
789    /// This is a string that will be written into the file metadata
790    pub fn set_created_by(mut self, value: String) -> Self {
791        self.created_by = value;
792        self
793    }
794
795    /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
796    /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
797    ///
798    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
799    ///
800    /// Note: As the offset indexes are useful for accessing data by row number,
801    /// they are always written by default, regardless of whether other statistics
802    /// are enabled. Disabling this metadata may result in a degradation in read
803    /// performance, so use this option with care.
804    ///
805    /// [`Page`]: EnabledStatistics::Page
806    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
807        self.offset_index_disabled = value;
808        self
809    }
810
811    /// Sets "key_value_metadata" property (defaults to `None`).
812    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
813        self.key_value_metadata = value;
814        self
815    }
816
817    /// Sets sorting order of rows in the row group if any (defaults to `None`).
818    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
819        self.sorting_columns = value;
820        self
821    }
822
823    /// Sets the max length of min/max value fields when writing the column
824    /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
825    ///
826    /// This can be used to prevent columns with very long values (hundreds of
827    /// bytes long) from causing the parquet metadata to become huge.
828    ///
829    /// # Notes
830    ///
831    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
832    /// set to [`EnabledStatistics::Page`].
833    ///
834    /// * If `Some`, must be greater than 0, otherwise will panic
835    /// * If `None`, there's no effective limit.
836    ///
837    /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
838    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
839        if let Some(value) = max_length {
840            assert!(
841                value > 0,
842                "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`."
843            );
844        }
845
846        self.column_index_truncate_length = max_length;
847        self
848    }
849
850    /// Sets the max length of min/max value fields in row group and data page header
851    /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
852    ///
853    /// # Notes
854    /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
855    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
856    /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
857    /// [`EnabledStatistics::Page`].
858    ///
859    /// * If `Some`, must be greater than 0, otherwise will panic
860    /// * If `None`, there's no effective limit.
861    ///
862    /// # See also
863    /// Truncation of Page Index statistics is controlled separately via
864    /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
865    ///
866    /// [`Statistics`]: crate::file::statistics::Statistics
867    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
868        if let Some(value) = max_length {
869            assert!(
870                value > 0,
871                "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`."
872            );
873        }
874
875        self.statistics_truncate_length = max_length;
876        self
877    }
878
879    /// Should the writer coerce types to parquet native types (defaults to `false` via
880    /// [`DEFAULT_COERCE_TYPES`]).
881    ///
882    /// Leaving this option the default `false` will ensure the exact same data
883    /// written to parquet using this library will be read.
884    ///
885    /// Setting this option to `true` will result in parquet files that can be
886    /// read by more readers, but potentially lose information in the process.
887    ///
888    /// * Types such as [`DataType::Date64`], which have no direct corresponding
889    ///   Parquet type, may be stored with lower precision.
890    ///
891    /// * The internal field names of `List` and `Map` types will be renamed if
892    ///   necessary to match what is required by the newest Parquet specification.
893    ///
894    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
895    ///
896    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
897    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
898    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
899        self.coerce_types = coerce_types;
900        self
901    }
902
903    /// EXPERIMENTAL: Should the writer emit the `path_in_schema` element of the
904    /// `ColumnMetaData` Thrift struct. Defaults to `true` via [`DEFAULT_WRITE_PATH_IN_SCHEMA`].
905    ///
906    /// Because `path_in_schema` is a field on the `ColumnMetaData`, it is repeated
907    /// `num_columns * num_rowgroups` times. Compounding this is any level of nesting or
908    /// repetition in the schema. For instance, a top-level list column named `foo` will have
909    /// a `path_in_schema` of `["foo", "list", "element"]`. A list-of-struct is even worse,
910    /// because the necessary list wrapping is repeated for each element of the struct. A
911    /// file with a deeply nested schema and many row groups can have a large percentage of the
912    /// footer taken up by this field. For example, a file of 38 row groups with a schema containing
913    /// several lists of structs containing lists had 36% of the footer taken up by `path_in_schema`.
914    /// Removing this redundant information can greatly speed up footer parsing, which is particularly
915    /// important in scenarios where one does not wish to read the entire file (e.g. point
916    /// lookups).
917    ///
918    /// <div class="warning">
919    ///
920    /// **WARNING:**
921    /// Setting this to `false` will break compatibility with Parquet readers that
922    /// still expect this field to be present. Virtually all Parquet readers (parquet-java,
923    /// Spark, arrow-cpp, pyarrow, pandas to name a few), with the exception
924    /// of the one in this crate, expect this field to be present, and will terminate execution
925    /// if it is not. This will continue to be the case unless/until the Parquet format
926    /// specification is explicitly changed to allow this field to be missing. As a consquence,
927    /// users should only set this to `false` if they have verified that any reader(s) they plan
928    /// to use can tolerate the absence of this field.
929    ///
930    /// For more context, see [GH-563].
931    ///
932    /// </div>
933    ///
934    /// [GH-563]: https://github.com/apache/parquet-format/issues/563
935    pub fn set_write_path_in_schema(mut self, write_path_in_schema: bool) -> Self {
936        self.write_path_in_schema = write_path_in_schema;
937        self
938    }
939
940    /// EXPERIMENTAL: Sets content-defined chunking options, or disables CDC with `None`.
941    ///
942    /// When enabled, data page boundaries are determined by a rolling hash of the
943    /// column values, so unchanged data produces identical byte sequences across
944    /// file versions. This enables efficient deduplication on content-addressable
945    /// storage systems.
946    ///
947    /// Only supported through the Arrow writer interface ([`ArrowWriter`]).
948    ///
949    /// # Panics
950    ///
951    /// Panics if `min_chunk_size == 0` or `max_chunk_size <= min_chunk_size`.
952    ///
953    /// [`ArrowWriter`]: crate::arrow::arrow_writer::ArrowWriter
954    pub fn set_content_defined_chunking(mut self, options: Option<CdcOptions>) -> Self {
955        if let Some(ref options) = options {
956            assert!(
957                options.min_chunk_size > 0,
958                "min_chunk_size must be positive"
959            );
960            assert!(
961                options.max_chunk_size > options.min_chunk_size,
962                "max_chunk_size ({}) must be greater than min_chunk_size ({})",
963                options.max_chunk_size,
964                options.min_chunk_size
965            );
966        }
967        self.content_defined_chunking = options;
968        self
969    }
970
971    /// Sets the default compression ratio threshold at or above which a Data Page
972    /// v2's compressed values are discarded in favor of writing the values
973    /// uncompressed, for all columns (defaults to `1.0` via
974    /// [`DEFAULT_DATA_PAGE_V2_COMPRESSION_RATIO_THRESHOLD`]).
975    ///
976    /// When writing a Data Page v2 with a configured compression codec, the writer
977    /// first compresses the values and then compares the compressed size to the
978    /// uncompressed size. If `compressed_size >= uncompressed_size * threshold`, the
979    /// compressed buffer is discarded and the values are written uncompressed for
980    /// that page (the page's `is_compressed` flag is set to `false`).
981    ///
982    /// The default of `1.0` preserves the historical behavior of only keeping
983    /// compression when it strictly reduces the size. Setting a value below `1.0`
984    /// requires a minimum amount of size reduction to keep the compressed page —
985    /// for example `0.9` requires at least a 10% reduction. Setting a value above
986    /// `1.0` keeps the compressed buffer even if it's somewhat larger than the
987    /// uncompressed values.
988    ///
989    /// This setting only affects Data Page v2; Data Page v1 always stores the
990    /// compressor's output regardless of the resulting size.
991    ///
992    /// # Panics
993    /// If `value` is not finite or is not strictly positive.
994    pub fn set_data_page_v2_compression_ratio_threshold(mut self, value: f64) -> Self {
995        self.default_column_properties
996            .set_data_page_v2_compression_ratio_threshold(value);
997        self
998    }
999
1000    /// Sets FileEncryptionProperties (defaults to `None`)
1001    #[cfg(feature = "encryption")]
1002    pub fn with_file_encryption_properties(
1003        mut self,
1004        file_encryption_properties: Arc<FileEncryptionProperties>,
1005    ) -> Self {
1006        self.file_encryption_properties = Some(file_encryption_properties);
1007        self
1008    }
1009
1010    // ----------------------------------------------------------------------
1011    // Setters for any column (global)
1012
1013    /// Sets default encoding for all columns.
1014    ///
1015    /// If dictionary is not enabled, this is treated as a primary encoding for all
1016    /// columns. In case when dictionary is enabled for any column, this value is
1017    /// considered to be a fallback encoding for that column.
1018    ///
1019    /// # Panics
1020    ///
1021    /// if dictionary encoding is specified, regardless of dictionary
1022    /// encoding flag being set.
1023    pub fn set_encoding(mut self, value: Encoding) -> Self {
1024        self.default_column_properties.set_encoding(value);
1025        self
1026    }
1027
1028    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
1029    /// [`DEFAULT_COMPRESSION`]).
1030    ///
1031    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
1032    pub fn set_compression(mut self, value: Compression) -> Self {
1033        self.default_column_properties.set_compression(value);
1034        self
1035    }
1036
1037    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
1038    /// via [`DEFAULT_DICTIONARY_ENABLED`]).
1039    ///
1040    /// Use this method to set dictionary encoding, instead of explicitly specifying
1041    /// encoding in `set_encoding` method.
1042    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
1043        self.default_column_properties.set_dictionary_enabled(value);
1044        self
1045    }
1046
1047    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
1048    /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
1049    ///
1050    /// The parquet writer will attempt to limit the size of each
1051    /// `DataPage` used to store dictionaries to this many
1052    /// bytes. Reducing this value will result in larger parquet
1053    /// files, but may improve the effectiveness of page index based
1054    /// predicate pushdown during reading.
1055    ///
1056    /// Note: this is a best effort limit based on value of
1057    /// [`set_write_batch_size`](Self::set_write_batch_size).
1058    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
1059        self.default_column_properties
1060            .set_dictionary_page_size_limit(value);
1061        self
1062    }
1063
1064    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
1065    /// via [`DEFAULT_PAGE_SIZE`]).
1066    ///
1067    /// The parquet writer will attempt to limit the sizes of each
1068    /// `DataPage` to this many bytes. Reducing this value will result
1069    /// in larger parquet files, but may improve the effectiveness of
1070    /// page index based predicate pushdown during reading.
1071    ///
1072    /// Note: this is a best effort limit based on value of
1073    /// [`set_write_batch_size`](Self::set_write_batch_size).
1074    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
1075        self.default_column_properties
1076            .set_data_page_size_limit(value);
1077        self
1078    }
1079
1080    /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
1081    /// [`DEFAULT_STATISTICS_ENABLED`]).
1082    ///
1083    /// [`Page`]: EnabledStatistics::Page
1084    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
1085        self.default_column_properties.set_statistics_enabled(value);
1086        self
1087    }
1088
1089    /// enable/disable writing [`Statistics`] in the page header
1090    /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
1091    ///
1092    /// Only applicable if [`Page`] level statistics are gathered.
1093    ///
1094    /// Setting this value to `true` can greatly increase the size of the resulting Parquet
1095    /// file while yielding very little added benefit. Most modern Parquet implementations
1096    /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
1097    /// those in the page header.
1098    ///
1099    /// # Note
1100    ///
1101    /// Prior to version 56.0.0, the `parquet` crate always wrote these
1102    /// statistics (the equivalent of setting this option to `true`). This was
1103    /// changed in 56.0.0 to follow the recommendation in the Parquet
1104    /// specification. See [issue #7580] for more details.
1105    ///
1106    /// [`Statistics`]: crate::file::statistics::Statistics
1107    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
1108    /// [`Page`]: EnabledStatistics::Page
1109    /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
1110    pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
1111        self.default_column_properties
1112            .set_write_page_header_statistics(value);
1113        self
1114    }
1115
1116    /// Sets if bloom filter should be written for all columns (defaults to `false`).
1117    ///
1118    /// # Notes
1119    ///
1120    /// * If the bloom filter is enabled previously then it is a no-op.
1121    ///
1122    /// * If the bloom filter is not enabled, default values for ndv and fpp
1123    ///   value are used used. See [`set_bloom_filter_max_ndv`] and
1124    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
1125    ///
1126    /// [`set_bloom_filter_max_ndv`]: Self::set_bloom_filter_max_ndv
1127    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
1128    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
1129        self.default_column_properties
1130            .set_bloom_filter_enabled(value);
1131        self
1132    }
1133
1134    /// Sets the default target bloom filter false positive probability (fpp)
1135    /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
1136    ///
1137    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
1138    /// been called.
1139    ///
1140    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1141    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
1142        self.default_column_properties.set_bloom_filter_fpp(value);
1143        self
1144    }
1145
1146    /// Sets default maximum expected number of distinct values (ndv) for bloom filter
1147    /// for all columns (defaults to [`DEFAULT_BLOOM_FILTER_NDV`]).
1148    ///
1149    /// The bloom filter is initially sized for this many distinct values at the
1150    /// configured FPP, then folded down after all values are inserted to achieve
1151    /// optimal size. A good heuristic is to set this to the expected number of rows
1152    /// in the row group.
1153    ///
1154    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
1155    /// been called.
1156    ///
1157    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1158    pub fn set_bloom_filter_max_ndv(mut self, value: u64) -> Self {
1159        self.default_column_properties.set_bloom_filter_ndv(value);
1160        self
1161    }
1162
1163    /// Deprecated alias for [`Self::set_bloom_filter_max_ndv`].
1164    #[deprecated(since = "59.0.0", note = "Use `set_bloom_filter_max_ndv` instead")]
1165    pub fn set_bloom_filter_ndv(self, value: u64) -> Self {
1166        self.set_bloom_filter_max_ndv(value)
1167    }
1168
1169    // ----------------------------------------------------------------------
1170    // Setters for a specific column
1171
1172    /// Helper method to get existing or new mutable reference of column properties.
1173    #[inline]
1174    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
1175        self.column_properties.entry(col).or_default()
1176    }
1177
1178    /// Sets encoding for a specific column.
1179    ///
1180    /// Takes precedence over [`Self::set_encoding`].
1181    ///
1182    /// If dictionary is not enabled, this is treated as a primary encoding for this
1183    /// column. In case when dictionary is enabled for this column, either through
1184    /// global defaults or explicitly, this value is considered to be a fallback
1185    /// encoding for this column.
1186    ///
1187    /// # Panics
1188    /// If user tries to set dictionary encoding here, regardless of dictionary
1189    /// encoding flag being set.
1190    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
1191        self.get_mut_props(col).set_encoding(value);
1192        self
1193    }
1194
1195    /// Sets compression codec for a specific column.
1196    ///
1197    /// Takes precedence over [`Self::set_compression`].
1198    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
1199        self.get_mut_props(col).set_compression(value);
1200        self
1201    }
1202
1203    /// Sets flag to enable/disable dictionary encoding for a specific column.
1204    ///
1205    /// Takes precedence over [`Self::set_dictionary_enabled`].
1206    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
1207        self.get_mut_props(col).set_dictionary_enabled(value);
1208        self
1209    }
1210
1211    /// Sets dictionary page size limit for a specific column.
1212    ///
1213    /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
1214    pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
1215        self.get_mut_props(col)
1216            .set_dictionary_page_size_limit(value);
1217        self
1218    }
1219
1220    /// Sets data page size limit for a specific column.
1221    ///
1222    /// Takes precedence over [`Self::set_data_page_size_limit`].
1223    pub fn set_column_data_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
1224        self.get_mut_props(col).set_data_page_size_limit(value);
1225        self
1226    }
1227
1228    /// Sets [`EnabledStatistics`] level for a specific column.
1229    ///
1230    /// Takes precedence over [`Self::set_statistics_enabled`].
1231    pub fn set_column_statistics_enabled(
1232        mut self,
1233        col: ColumnPath,
1234        value: EnabledStatistics,
1235    ) -> Self {
1236        self.get_mut_props(col).set_statistics_enabled(value);
1237        self
1238    }
1239
1240    /// Sets whether to write [`Statistics`] in the page header for a specific column.
1241    ///
1242    /// Takes precedence over [`Self::set_write_page_header_statistics`].
1243    ///
1244    /// [`Statistics`]: crate::file::statistics::Statistics
1245    pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
1246        self.get_mut_props(col)
1247            .set_write_page_header_statistics(value);
1248        self
1249    }
1250
1251    /// Sets whether a bloom filter should be written for a specific column.
1252    ///
1253    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
1254    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
1255        self.get_mut_props(col).set_bloom_filter_enabled(value);
1256        self
1257    }
1258
1259    /// Sets the false positive probability for bloom filter for a specific column.
1260    ///
1261    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
1262    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
1263        self.get_mut_props(col).set_bloom_filter_fpp(value);
1264        self
1265    }
1266
1267    /// Sets the maximum expected number of distinct values for bloom filter for
1268    /// a specific column.
1269    ///
1270    /// Takes precedence over [`Self::set_bloom_filter_max_ndv`].
1271    pub fn set_column_bloom_filter_max_ndv(mut self, col: ColumnPath, value: u64) -> Self {
1272        self.get_mut_props(col).set_bloom_filter_ndv(value);
1273        self
1274    }
1275
1276    /// Sets the Data Page v2 compression ratio threshold for a specific column.
1277    ///
1278    /// Takes precedence over [`Self::set_data_page_v2_compression_ratio_threshold`].
1279    ///
1280    /// # Panics
1281    /// If `value` is not finite or is not strictly positive.
1282    pub fn set_column_data_page_v2_compression_ratio_threshold(
1283        mut self,
1284        col: ColumnPath,
1285        value: f64,
1286    ) -> Self {
1287        self.get_mut_props(col)
1288            .set_data_page_v2_compression_ratio_threshold(value);
1289        self
1290    }
1291
1292    /// Deprecated alias for [`Self::set_column_bloom_filter_max_ndv`].
1293    #[deprecated(
1294        since = "59.0.0",
1295        note = "Use `set_column_bloom_filter_max_ndv` instead"
1296    )]
1297    pub fn set_column_bloom_filter_ndv(self, col: ColumnPath, value: u64) -> Self {
1298        self.set_column_bloom_filter_max_ndv(col, value)
1299    }
1300
1301    /// Sets the [`BloomFilterProperties`] for all columns, implicitly enabling
1302    /// the bloom filter.
1303    ///
1304    /// Both `fpp` and `ndv` from `value` are treated as explicit and will not
1305    /// be overridden by the build-time row-group-size NDV fallback. For
1306    /// dynamic NDV sizing (resolved to `max_row_group_row_count` at build
1307    /// time), use [`Self::set_bloom_filter_enabled`] or
1308    /// [`Self::set_bloom_filter_fpp`] instead.
1309    pub fn set_bloom_filter_properties(mut self, value: BloomFilterProperties) -> Self {
1310        self.default_column_properties
1311            .set_bloom_filter_properties(value);
1312        self
1313    }
1314
1315    /// Sets the [`BloomFilterProperties`] for a specific column.
1316    ///
1317    /// Takes precedence over [`Self::set_bloom_filter_properties`].
1318    pub fn set_column_bloom_filter_properties(
1319        mut self,
1320        col: ColumnPath,
1321        value: BloomFilterProperties,
1322    ) -> Self {
1323        self.get_mut_props(col).set_bloom_filter_properties(value);
1324        self
1325    }
1326}
1327
1328impl From<WriterProperties> for WriterPropertiesBuilder {
1329    fn from(props: WriterProperties) -> Self {
1330        WriterPropertiesBuilder {
1331            data_page_row_count_limit: props.data_page_row_count_limit,
1332            write_batch_size: props.write_batch_size,
1333            max_row_group_row_count: props.max_row_group_row_count,
1334            max_row_group_bytes: props.max_row_group_bytes,
1335            bloom_filter_position: props.bloom_filter_position,
1336            writer_version: props.writer_version,
1337            created_by: props.created_by,
1338            offset_index_disabled: !matches!(
1339                props.offset_index_setting,
1340                OffsetIndexSetting::Enabled
1341            ),
1342            key_value_metadata: props.key_value_metadata,
1343            default_column_properties: props.default_column_properties,
1344            column_properties: props.column_properties,
1345            sorting_columns: props.sorting_columns,
1346            column_index_truncate_length: props.column_index_truncate_length,
1347            statistics_truncate_length: props.statistics_truncate_length,
1348            coerce_types: props.coerce_types,
1349            content_defined_chunking: props.content_defined_chunking,
1350            write_path_in_schema: props.write_path_in_schema,
1351            #[cfg(feature = "encryption")]
1352            file_encryption_properties: props.file_encryption_properties,
1353        }
1354    }
1355}
1356
/// Selects how much statistics the writer computes and stores in the parquet
/// file.
///
/// Enabling statistics makes the resulting Parquet file larger, and reading
/// the parquet footer takes more time.
///
/// A query engine that supports evaluating predicates against statistics can
/// use them to prune row groups and pages during query execution, which
/// improves query performance.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EnabledStatistics {
    /// Do not compute any statistics.
    None,
    /// Compute statistics at the column chunk level, but not at the page level.
    ///
    /// With this option, one set of statistics is stored for each relevant
    /// column of each row group, so the amount of stored statistics grows
    /// with the number of row groups written.
    Chunk,
    /// Compute statistics at both the page level and the column chunk level.
    ///
    /// With this option, one set of statistics is stored for each relevant
    /// column of each row group, and in addition the column index is written.
    /// The offset index does not depend on choosing this level: it is written
    /// by default, and at this level it is written even if
    /// [`WriterPropertiesBuilder::set_offset_index_disabled`] asked for it to
    /// be disabled. See [`ParquetColumnIndex`] for more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}
1387
1388impl FromStr for EnabledStatistics {
1389    type Err = String;
1390
1391    fn from_str(s: &str) -> Result<Self, Self::Err> {
1392        match s {
1393            "NONE" | "none" => Ok(EnabledStatistics::None),
1394            "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk),
1395            "PAGE" | "page" => Ok(EnabledStatistics::Page),
1396            _ => Err(format!("Invalid statistics arg: {s}")),
1397        }
1398    }
1399}
1400
1401impl Default for EnabledStatistics {
1402    fn default() -> Self {
1403        DEFAULT_STATISTICS_ENABLED
1404    }
1405}
1406
/// Configures the bloom filter that the writer computes.
///
/// The bloom filter is first sized for `ndv` distinct values at the given `fpp`. Once all
/// values have been inserted it is automatically folded down to reach an optimal size while
/// still maintaining the target `fpp`. See [`Sbbf::fold_to_target_fpp`] for details on the
/// folding algorithm.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    file::properties::{BloomFilterProperties, WriterProperties},
/// #    schema::types::ColumnPath,
/// # };
/// // Build a BloomFilterProperties via the builder, then apply it to one column.
/// let bf = BloomFilterProperties::builder()
///     .with_fpp(0.01)
///     .with_max_ndv(10_000)
///     .build();
///
/// let props = WriterProperties::builder()
///     .set_column_bloom_filter_properties(ColumnPath::from("user_id"), bf.clone())
///     .build();
///
/// assert_eq!(
///     props.bloom_filter_properties(&ColumnPath::from("user_id")),
///     Some(&bf)
/// );
/// ```
///
/// [`Sbbf::fold_to_target_fpp`]: crate::bloom_filter::Sbbf::fold_to_target_fpp
#[derive(Clone, Debug, PartialEq)]
pub struct BloomFilterProperties {
    /// Target false positive probability.
    fpp: f64,
    /// Number of distinct values the filter is initially sized for.
    ndv: u64,
}
1443
1444impl Default for BloomFilterProperties {
1445    fn default() -> Self {
1446        BloomFilterProperties {
1447            fpp: DEFAULT_BLOOM_FILTER_FPP,
1448            ndv: DEFAULT_BLOOM_FILTER_NDV,
1449        }
1450    }
1451}
1452
impl BloomFilterProperties {
    /// Returns a new [`BloomFilterPropertiesBuilder`] for constructing
    /// [`BloomFilterProperties`] with custom values.
    pub fn builder() -> BloomFilterPropertiesBuilder {
        BloomFilterPropertiesBuilder::new()
    }

    /// False positive probability. This should always be between 0 and 1 exclusive.
    /// Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp:
    /// the smaller the fpp, the more memory and disk space is required, thus setting it to a
    /// reasonable value, e.g. 0.1, 0.05, or 0.001, is recommended.
    ///
    /// This value also serves as the target FPP for bloom filter folding: after all values
    /// are inserted, the filter is folded down to the smallest size that still meets this FPP.
    pub fn fpp(&self) -> f64 {
        self.fpp
    }

    /// Maximum expected number of distinct values. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_max_ndv`].
    ///
    /// When not explicitly set via the builder, this defaults to
    /// [`max_row_group_row_count`](WriterProperties::max_row_group_row_count) (resolved at
    /// build time). The bloom filter is initially sized for this many distinct values at the
    /// given `fpp`, then folded down after insertion to achieve optimal size. A good heuristic
    /// is to set this to the expected number of rows in the row group. If fewer distinct values
    /// are actually written, the filter will be automatically compacted via folding.
    ///
    /// Thus the only negative side of overestimating this value is that the bloom filter
    /// will use more memory during writing than necessary, but it will not affect the final
    /// bloom filter size on disk.
    ///
    /// If you wish to reduce memory usage during writing and are able to make a reasonable estimate
    /// of the number of distinct values in a row group, it is recommended to set this value explicitly
    /// rather than relying on the default dynamic sizing based on `max_row_group_row_count`.
    /// If you do set this value explicitly it is probably best to set it for each column
    /// individually via [`WriterPropertiesBuilder::set_column_bloom_filter_max_ndv`] rather than globally,
    /// since different columns may have different numbers of distinct values.
    pub fn ndv(&self) -> u64 {
        self.ndv
    }
}
1499
/// Builder for [`BloomFilterProperties`].
///
/// Use [`BloomFilterProperties::builder`] or [`BloomFilterPropertiesBuilder::new`]
/// as the entry point.
#[derive(Debug, Clone, Default)]
pub struct BloomFilterPropertiesBuilder {
    /// Requested false positive probability; `None` until [`Self::with_fpp`] is called,
    /// in which case [`DEFAULT_BLOOM_FILTER_FPP`] is used at build time.
    fpp: Option<f64>,
    /// Requested maximum number of distinct values; `None` until [`Self::with_max_ndv`]
    /// is called, in which case [`DEFAULT_BLOOM_FILTER_NDV`] is used at build time.
    ndv: Option<u64>,
}
1509
1510impl BloomFilterPropertiesBuilder {
1511    /// Returns a new builder with no fields set.
1512    ///
1513    /// Equivalent to [`BloomFilterProperties::builder`].
1514    pub fn new() -> Self {
1515        Self::default()
1516    }
1517
1518    /// Sets the target false positive probability.
1519    ///
1520    /// The value must be in `(0.0, 1.0)` exclusively; this is validated at
1521    /// build time by [`Self::build`] / [`Self::try_build`]. When unset, the
1522    /// default is `0.05` (5%, see [`DEFAULT_BLOOM_FILTER_FPP`]).
1523    pub fn with_fpp(mut self, fpp: f64) -> Self {
1524        self.fpp = Some(fpp);
1525        self
1526    }
1527
1528    /// Sets the maximum expected number of distinct values used to size the
1529    /// bloom filter before folding.
1530    ///
1531    /// When unset, the default is `1_048_576` (see [`DEFAULT_BLOOM_FILTER_NDV`]),
1532    /// which at the default fpp of 5% reserves roughly 1 MiB per column for the
1533    /// filter bitset, derived as follows:
1534    ///
1535    /// ```text
1536    /// ndv = 1,048,576, fpp = 0.05
1537    ///   0.05^(1/8)                         ≈ 0.6877
1538    ///   1 - 0.6877                         ≈ 0.3123
1539    ///   ln(0.3123)                         ≈ -1.164
1540    ///   num_bits = -8 * 1,048,576 / -1.164 ≈ 7,206,000 bits
1541    ///                                      ≈   900,750 bytes (~900 KB)
1542    ///   next_power_of_two(900 KB)          = 1 MiB (= 1,048,576 bytes)
1543    /// ```
1544    pub fn with_max_ndv(mut self, ndv: u64) -> Self {
1545        self.ndv = Some(ndv);
1546        self
1547    }
1548
1549    /// Builds [`BloomFilterProperties`].
1550    ///
1551    /// Panics if the configured `fpp` is not in `(0.0, 1.0)` exclusive.
1552    /// Use [`Self::try_build`] for a non-panicking alternative.
1553    pub fn build(self) -> BloomFilterProperties {
1554        self.try_build().unwrap_or_else(|e| panic!("{e}"))
1555    }
1556
1557    /// Builds [`BloomFilterProperties`], returning an error instead of
1558    /// panicking when the configured `fpp` is not in `(0.0, 1.0)` exclusive.
1559    pub fn try_build(self) -> Result<BloomFilterProperties> {
1560        let fpp = self.fpp.unwrap_or(DEFAULT_BLOOM_FILTER_FPP);
1561        validate_bloom_filter_fpp(fpp).map_err(ParquetError::General)?;
1562        let ndv = self.ndv.unwrap_or(DEFAULT_BLOOM_FILTER_NDV);
1563        Ok(BloomFilterProperties { fpp, ndv })
1564    }
1565}
1566
/// The one place where the bloom filter fpp range is checked. Both
/// [`ColumnProperties::set_bloom_filter_fpp`] (which panics on failure) and
/// [`BloomFilterPropertiesBuilder::try_build`] (which returns the failure)
/// delegate here.
///
/// Returns `Ok(())` when `fpp` lies strictly between 0.0 and 1.0, otherwise an
/// `Err` holding a message that includes the rejected value.
fn validate_bloom_filter_fpp(fpp: f64) -> std::result::Result<(), String> {
    // Phrased as a positive range test: NaN fails every comparison and so
    // lands in the error branch together with out-of-range values.
    let in_range = fpp > 0.0 && fpp < 1.0;
    if in_range {
        Ok(())
    } else {
        Err(format!(
            "fpp must be between 0.0 and 1.0 exclusive, got {fpp}"
        ))
    }
}
1578
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    /// Encoding requested for the column; dictionary encodings are rejected by the setter.
    encoding: Option<Encoding>,
    /// Compression codec requested for the column.
    codec: Option<Compression>,
    /// Data page size limit requested for the column.
    data_page_size_limit: Option<usize>,
    /// Dictionary page size limit requested for the column.
    dictionary_page_size_limit: Option<usize>,
    /// Whether dictionary encoding was explicitly enabled or disabled.
    dictionary_enabled: Option<bool>,
    /// Statistics level requested for the column.
    statistics_enabled: Option<EnabledStatistics>,
    /// Whether statistics should be written to the page header.
    write_page_header_statistics: Option<bool>,
    /// bloom filter related properties; `None` means the bloom filter is not enabled
    bloom_filter_properties: Option<BloomFilterProperties>,
    /// Whether the bloom filter NDV was explicitly set by the user
    bloom_filter_ndv_is_set: bool,
    /// Data Page v2 compression ratio threshold; the setter only accepts
    /// positive finite values.
    data_page_v2_compression_ratio_threshold: Option<f64>,
}
1598
1599impl ColumnProperties {
1600    /// Sets encoding for this column.
1601    ///
1602    /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1603    /// In case when dictionary is enabled for a column, this value is considered to
1604    /// be a fallback encoding.
1605    ///
1606    /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1607    /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1608    /// for a column.
1609    fn set_encoding(&mut self, value: Encoding) {
1610        if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1611            panic!("Dictionary encoding can not be used as fallback encoding");
1612        }
1613        self.encoding = Some(value);
1614    }
1615
1616    /// Sets compression codec for this column.
1617    fn set_compression(&mut self, value: Compression) {
1618        self.codec = Some(value);
1619    }
1620
1621    /// Sets data page size limit for this column.
1622    fn set_data_page_size_limit(&mut self, value: usize) {
1623        self.data_page_size_limit = Some(value);
1624    }
1625
1626    /// Sets whether dictionary encoding is enabled for this column.
1627    fn set_dictionary_enabled(&mut self, enabled: bool) {
1628        self.dictionary_enabled = Some(enabled);
1629    }
1630
1631    /// Sets dictionary page size limit for this column.
1632    fn set_dictionary_page_size_limit(&mut self, value: usize) {
1633        self.dictionary_page_size_limit = Some(value);
1634    }
1635
1636    /// Sets the statistics level for this column.
1637    fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1638        self.statistics_enabled = Some(enabled);
1639    }
1640
1641    /// Sets whether to write statistics in the page header for this column.
1642    fn set_write_page_header_statistics(&mut self, enabled: bool) {
1643        self.write_page_header_statistics = Some(enabled);
1644    }
1645
1646    /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1647    /// otherwise it is a no-op.
1648    /// If `value` is `false`, resets bloom filter properties to `None`.
1649    fn set_bloom_filter_enabled(&mut self, value: bool) {
1650        if value && self.bloom_filter_properties.is_none() {
1651            self.bloom_filter_properties = Some(Default::default())
1652        } else if !value {
1653            self.bloom_filter_properties = None
1654        }
1655    }
1656
1657    /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1658    /// bloom filter if not previously enabled.
1659    ///
1660    /// # Panics
1661    ///
1662    /// Panics if the `value` is not between 0 and 1 exclusive
1663    fn set_bloom_filter_fpp(&mut self, value: f64) {
1664        if let Err(msg) = validate_bloom_filter_fpp(value) {
1665            panic!("{msg}");
1666        }
1667        self.bloom_filter_properties
1668            .get_or_insert_with(Default::default)
1669            .fpp = value;
1670    }
1671
1672    /// Sets the maximum expected number of distinct (unique) values for bloom filter for this
1673    /// column, and implicitly enables bloom filter if not previously enabled.
1674    fn set_bloom_filter_ndv(&mut self, value: u64) {
1675        self.bloom_filter_properties
1676            .get_or_insert_with(Default::default)
1677            .ndv = value;
1678        self.bloom_filter_ndv_is_set = true;
1679    }
1680
1681    /// Sets the bloom filter properties for this column from a fully-built
1682    /// [`BloomFilterProperties`], implicitly enabling the bloom filter.
1683    ///
1684    /// Both `fpp` and `ndv` from `value` are treated as explicit, so the
1685    /// build-time row-group-size NDV fallback in
1686    /// [`WriterPropertiesBuilder::build`] will not override them.
1687    fn set_bloom_filter_properties(&mut self, value: BloomFilterProperties) {
1688        self.bloom_filter_properties = Some(value);
1689        self.bloom_filter_ndv_is_set = true;
1690    }
1691
1692    /// Sets the Data Page v2 compression ratio threshold for this column.
1693    ///
1694    /// # Panics
1695    /// If `value` is not finite or is not strictly positive.
1696    fn set_data_page_v2_compression_ratio_threshold(&mut self, value: f64) {
1697        assert!(
1698            value.is_finite() && value > 0.0,
1699            "data_page_v2_compression_ratio_threshold must be a positive finite number, got {value}"
1700        );
1701        self.data_page_v2_compression_ratio_threshold = Some(value);
1702    }
1703
1704    /// Returns optional encoding for this column.
1705    fn encoding(&self) -> Option<Encoding> {
1706        self.encoding
1707    }
1708
1709    /// Returns optional compression codec for this column.
1710    fn compression(&self) -> Option<Compression> {
1711        self.codec
1712    }
1713
1714    /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1715    /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1716    /// been provided.
1717    fn dictionary_enabled(&self) -> Option<bool> {
1718        self.dictionary_enabled
1719    }
1720
1721    /// Returns optional dictionary page size limit for this column.
1722    fn dictionary_page_size_limit(&self) -> Option<usize> {
1723        self.dictionary_page_size_limit
1724    }
1725
1726    /// Returns optional data page size limit for this column.
1727    fn data_page_size_limit(&self) -> Option<usize> {
1728        self.data_page_size_limit
1729    }
1730
1731    /// Returns optional statistics level requested for this column. If result is `None`,
1732    /// then no setting has been provided.
1733    fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1734        self.statistics_enabled
1735    }
1736
1737    /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
1738    /// column.
1739    ///
1740    /// [`Statistics`]: crate::file::statistics::Statistics
1741    fn write_page_header_statistics(&self) -> Option<bool> {
1742        self.write_page_header_statistics
1743    }
1744
1745    /// Returns the bloom filter properties, or `None` if not enabled
1746    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1747        self.bloom_filter_properties.as_ref()
1748    }
1749
1750    /// Returns optional Data Page v2 compression ratio threshold for this column.
1751    fn data_page_v2_compression_ratio_threshold(&self) -> Option<f64> {
1752        self.data_page_v2_compression_ratio_threshold
1753    }
1754
1755    /// If bloom filter is enabled and NDV was not explicitly set, resolve it to the
1756    /// given `default_ndv` (typically derived from `max_row_group_row_count`).
1757    fn resolve_bloom_filter_ndv(&mut self, default_ndv: u64) {
1758        if !self.bloom_filter_ndv_is_set {
1759            if let Some(ref mut bf) = self.bloom_filter_properties {
1760                bf.ndv = default_ndv;
1761            }
1762        }
1763    }
1764}
1765
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

// Value used by `ReaderPropertiesBuilder::build` when `set_read_bloom_filter` was never called.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
// Value used by `ReaderPropertiesBuilder::build` when `set_read_page_statistics` was never called.
const DEFAULT_READ_PAGE_STATS: bool = false;
1771
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    /// Codec options produced by the builder's `CodecOptionsBuilder`.
    codec_options: CodecOptions,
    /// Whether to read the bloom filter.
    read_bloom_filter: bool,
    /// Whether to read page level statistics.
    read_page_stats: bool,
}
1795
impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    ///
    /// No option is set on the returned builder; each one falls back to its
    /// default when [`ReaderPropertiesBuilder::build`] is called.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether to read bloom filter
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }

    /// Returns whether to read page level statistics
    pub(crate) fn read_page_stats(&self) -> bool {
        self.read_page_stats
    }
}
1817
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    /// Accumulates codec-related options such as backward compatible LZ4.
    codec_options_builder: CodecOptionsBuilder,
    /// `None` until `set_read_bloom_filter` is called; the default is applied in `build`.
    read_bloom_filter: Option<bool>,
    /// `None` until `set_read_page_statistics` is called; the default is applied in `build`.
    read_page_stats: Option<bool>,
}
1825
/// Reader properties builder.
impl ReaderPropertiesBuilder {
    /// Returns default state of the builder.
    ///
    /// Both read flags start out unset (`None`), so `build` falls back to
    /// their defaults.
    fn with_defaults() -> Self {
        Self {
            codec_options_builder: CodecOptionsBuilder::default(),
            read_bloom_filter: None,
            read_page_stats: None,
        }
    }

    /// Finalizes the configuration and returns immutable reader properties struct.
    ///
    /// Flags that were never set fall back to their defaults, which are both `false`.
    pub fn build(self) -> ReaderProperties {
        ReaderProperties {
            codec_options: self.codec_options_builder.build(),
            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
            read_page_stats: self.read_page_stats.unwrap_or(DEFAULT_READ_PAGE_STATS),
        }
    }

    /// Enable/disable backward compatible LZ4.
    ///
    /// If backward compatible LZ4 is enabled, on LZ4_HADOOP error it will fall back
    /// to the older versions of the LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
    /// with files generated by older versions of this library, and LZ4_RAW, for backward
    /// compatibility with files generated by older versions of parquet-cpp.
    ///
    /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
        self.codec_options_builder = self
            .codec_options_builder
            .set_backward_compatible_lz4(value);
        self
    }

    /// Enable/disable reading bloom filter
    ///
    /// If reading bloom filter is enabled, bloom filter will be read from the file.
    /// If reading bloom filter is disabled, bloom filter will not be read from the file.
    ///
    /// By default the bloom filter is not read.
    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
        self.read_bloom_filter = Some(value);
        self
    }

    /// Enable/disable reading page-level statistics
    ///
    /// If set to `true`, then the reader will decode and populate the [`Statistics`] for
    /// each page, if present.
    /// If set to `false`, then the reader will skip decoding the statistics.
    ///
    /// By default statistics will not be decoded.
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn set_read_page_statistics(mut self, value: bool) -> Self {
        self.read_page_stats = Some(value);
        self
    }
}
1886
1887#[cfg(test)]
1888mod tests {
1889    use super::*;
1890
    // Checks the numeric value reported by `as_num` for each writer version.
    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }
1896
    // Checks that `WriterProperties::default()` reports the `DEFAULT_*` constants
    // for file-level and per-column settings, and leaves the bloom filter disabled.
    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        // file-level settings
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(
            props.max_row_group_row_count(),
            Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT)
        );
        assert_eq!(props.max_row_group_bytes(), None);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        // per-column settings, queried for a column with no overrides
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        assert!(
            props
                .bloom_filter_properties(&ColumnPath::from("col"))
                .is_none()
        );
    }
1934
1935    #[test]
1936    fn test_writer_properties_dictionary_encoding() {
1937        // dictionary encoding is not configurable, and it should be the same for both
1938        // writer version 1 and 2.
1939        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
1940            let props = WriterProperties::builder()
1941                .set_writer_version(*version)
1942                .build();
1943            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
1944            assert_eq!(
1945                props.dictionary_data_page_encoding(),
1946                Encoding::RLE_DICTIONARY
1947            );
1948        }
1949    }
1950
    // `PLAIN_DICTIONARY` is rejected as the global (fallback) encoding.
    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }
1959
    // `RLE_DICTIONARY` is rejected as the global (fallback) encoding.
    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }
1968
    // A per-column dictionary encoding is rejected even when dictionary
    // encoding is enabled globally.
    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }
1977
    // A per-column dictionary encoding is rejected when dictionary encoding
    // is disabled globally as well.
    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }
1986
    // Sets file-level, global column and column-specific options through the
    // builder, then checks them on the built properties and again after a
    // round trip through `into_builder()`.
    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_row_count(Some(40))
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_max_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        // Shared assertions, run against both the direct and the round-tripped properties.
        fn test_props(props: &WriterProperties) {
            assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
            assert_eq!(props.data_page_size_limit(), 10);
            assert_eq!(props.dictionary_page_size_limit(), 20);
            assert_eq!(props.write_batch_size(), 30);
            assert_eq!(props.max_row_group_row_count(), Some(40));
            assert_eq!(props.created_by(), "default");
            assert_eq!(
                props.key_value_metadata(),
                Some(&vec![
                    KeyValue::new("key".to_string(), "value".to_string(),)
                ])
            );

            // Column "a" has no overrides, so it reports the global column settings.
            assert_eq!(
                props.encoding(&ColumnPath::from("a")),
                Some(Encoding::DELTA_BINARY_PACKED)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("a")),
                Compression::GZIP(Default::default())
            );
            assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("a")),
                EnabledStatistics::None
            );

            // Column "col" reports its column-specific settings.
            assert_eq!(
                props.encoding(&ColumnPath::from("col")),
                Some(Encoding::RLE)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("col")),
                Compression::SNAPPY
            );
            assert!(props.dictionary_enabled(&ColumnPath::from("col")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("col")),
                EnabledStatistics::Chunk
            );
            assert_eq!(
                props.bloom_filter_properties(&ColumnPath::from("col")),
                Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
            );
        }

        // Test direct build of properties
        test_props(&props);

        // Test that into_builder() gives the same result
        let props_into_builder_and_back = props.into_builder().build();
        test_props(&props_into_builder_and_back);
    }
2070
    // A column with only its encoding overridden still picks up the global
    // compression and bloom filter settings, and the default dictionary flag.
    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: DEFAULT_BLOOM_FILTER_FPP,
                ndv: DEFAULT_BLOOM_FILTER_NDV,
            })
        );
    }
2100
    // The deprecated `set_max_row_group_size` setter is visible through both
    // `max_row_group_row_count` and `max_row_group_size`.
    #[test]
    #[allow(deprecated)]
    fn test_writer_properties_deprecated_max_row_group_size_still_works() {
        let props = WriterProperties::builder()
            .set_max_row_group_size(42)
            .build();

        assert_eq!(props.max_row_group_row_count(), Some(42));
        assert_eq!(props.max_row_group_size(), 42);
    }
2111
    // A zero row count is rejected by the setter itself; `build()` is never called here.
    #[test]
    #[should_panic(expected = "Cannot have a 0 max row group row count")]
    fn test_writer_properties_panic_on_zero_row_group_row_count() {
        let _ = WriterProperties::builder().set_max_row_group_row_count(Some(0));
    }
2117
    // A zero byte limit is rejected by the setter itself; `build()` is never called here.
    #[test]
    #[should_panic(expected = "Cannot have a 0 max row group bytes")]
    fn test_writer_properties_panic_on_zero_row_group_bytes() {
        let _ = WriterProperties::builder().set_max_row_group_bytes(Some(0));
    }
2123
    // The bloom filter is off by default. Setting only the NDV or only the FPP
    // enables it, with the other value left at its default.
    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        // nothing set: bloom filter disabled
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        // only NDV set
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_max_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: DEFAULT_BLOOM_FILTER_FPP,
                ndv: 100,
            })
        );
        // only FPP set
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: DEFAULT_BLOOM_FILTER_NDV,
            })
        );
    }
2153
2154    #[test]
2155    fn test_writer_properties_column_data_page_v2_compression_ratio_threshold() {
2156        let props = WriterProperties::builder()
2157            .set_data_page_v2_compression_ratio_threshold(0.5)
2158            .set_column_data_page_v2_compression_ratio_threshold(ColumnPath::from("col"), 0.1)
2159            .build();
2160
2161        assert_eq!(props.data_page_v2_compression_ratio_threshold(), 0.5);
2162        assert_eq!(
2163            props.column_data_page_v2_compression_ratio_threshold(&ColumnPath::from("col")),
2164            0.1
2165        );
2166        assert_eq!(
2167            props.column_data_page_v2_compression_ratio_threshold(&ColumnPath::from("other")),
2168            0.5
2169        );
2170    }
2171
2172    #[test]
2173    #[should_panic(
2174        expected = "data_page_v2_compression_ratio_threshold must be a positive finite number"
2175    )]
2176    fn test_writer_properties_panic_on_invalid_data_page_v2_compression_ratio_threshold() {
2177        WriterProperties::builder()
2178            .set_data_page_v2_compression_ratio_threshold(0.0)
2179            .build();
2180    }
2181
2182    #[test]
2183    #[allow(deprecated)]
2184    fn test_writer_properties_deprecated_bloom_filter_ndv_setters_still_work() {
2185        let col = ColumnPath::from("col");
2186        let props = WriterProperties::builder()
2187            .set_bloom_filter_ndv(100)
2188            .set_column_bloom_filter_ndv(col.clone(), 200)
2189            .build();
2190        assert_eq!(
2191            props.bloom_filter_properties(&ColumnPath::from("other")),
2192            Some(&BloomFilterProperties {
2193                fpp: DEFAULT_BLOOM_FILTER_FPP,
2194                ndv: 100,
2195            })
2196        );
2197        assert_eq!(
2198            props.bloom_filter_properties(&col),
2199            Some(&BloomFilterProperties {
2200                fpp: DEFAULT_BLOOM_FILTER_FPP,
2201                ndv: 200,
2202            })
2203        );
2204    }
2205
2206    #[test]
2207    fn test_writer_properties_column_dictionary_page_size_limit() {
2208        let props = WriterProperties::builder()
2209            .set_dictionary_page_size_limit(100)
2210            .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
2211            .build();
2212
2213        assert_eq!(props.dictionary_page_size_limit(), 100);
2214        assert_eq!(
2215            props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
2216            10
2217        );
2218        assert_eq!(
2219            props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
2220            100
2221        );
2222    }
2223
2224    #[test]
2225    fn test_writer_properties_column_data_page_size_limit() {
2226        let props = WriterProperties::builder()
2227            .set_data_page_size_limit(100)
2228            .set_column_data_page_size_limit(ColumnPath::from("col"), 10)
2229            .build();
2230
2231        assert_eq!(props.data_page_size_limit(), 100);
2232        assert_eq!(
2233            props.column_data_page_size_limit(&ColumnPath::from("col")),
2234            10
2235        );
2236        assert_eq!(
2237            props.column_data_page_size_limit(&ColumnPath::from("other")),
2238            100
2239        );
2240    }
2241
2242    #[test]
2243    fn test_reader_properties_default_settings() {
2244        let props = ReaderProperties::builder().build();
2245
2246        let codec_options = CodecOptionsBuilder::default()
2247            .set_backward_compatible_lz4(true)
2248            .build();
2249
2250        assert_eq!(props.codec_options(), &codec_options);
2251        assert!(!props.read_bloom_filter());
2252    }
2253
2254    #[test]
2255    fn test_reader_properties_builder() {
2256        let props = ReaderProperties::builder()
2257            .set_backward_compatible_lz4(false)
2258            .build();
2259
2260        let codec_options = CodecOptionsBuilder::default()
2261            .set_backward_compatible_lz4(false)
2262            .build();
2263
2264        assert_eq!(props.codec_options(), &codec_options);
2265    }
2266
2267    #[test]
2268    fn test_parse_writerversion() {
2269        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
2270        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
2271        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
2272        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);
2273
2274        // test lowercase
2275        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
2276        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
2277
2278        // test invalid version
2279        match "PARQUET_-1_0".parse::<WriterVersion>() {
2280            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
2281            Err(e) => {
2282                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
2283            }
2284        }
2285    }
2286
2287    #[test]
2288    fn test_parse_enabledstatistics() {
2289        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
2290        assert_eq!(enabled_statistics, EnabledStatistics::None);
2291        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
2292        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
2293        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
2294        assert_eq!(enabled_statistics, EnabledStatistics::Page);
2295
2296        // test lowercase
2297        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
2298        assert_eq!(enabled_statistics, EnabledStatistics::None);
2299
2300        //test invalid statistics
2301        match "ChunkAndPage".parse::<EnabledStatistics>() {
2302            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
2303            Err(e) => {
2304                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
2305            }
2306        }
2307    }
2308
2309    #[test]
2310    fn test_cdc_options_equality() {
2311        let opts = CdcOptions::default();
2312        assert_eq!(opts, CdcOptions::default());
2313
2314        let custom = CdcOptions {
2315            min_chunk_size: 1024,
2316            max_chunk_size: 8192,
2317            norm_level: 1,
2318        };
2319        assert_eq!(custom, custom);
2320        assert_ne!(opts, custom);
2321    }
2322
2323    #[test]
2324    fn test_bloom_filter_builder_default() {
2325        let props = BloomFilterProperties::builder().build();
2326        assert_eq!(props.fpp, DEFAULT_BLOOM_FILTER_FPP);
2327        assert_eq!(props.ndv, DEFAULT_BLOOM_FILTER_NDV);
2328        assert_eq!(props, BloomFilterProperties::default());
2329        assert_eq!(
2330            BloomFilterPropertiesBuilder::new().build(),
2331            BloomFilterProperties::default()
2332        );
2333    }
2334
2335    #[test]
2336    fn test_bloom_filter_builder_explicit_fpp() {
2337        let props = BloomFilterProperties::builder().with_fpp(0.01).build();
2338        assert_eq!(props.fpp, 0.01);
2339        assert_eq!(props.ndv, DEFAULT_BLOOM_FILTER_NDV);
2340    }
2341
2342    #[test]
2343    fn test_bloom_filter_builder_explicit_ndv() {
2344        let props = BloomFilterProperties::builder().with_max_ndv(1000).build();
2345        assert_eq!(props.fpp, DEFAULT_BLOOM_FILTER_FPP);
2346        assert_eq!(props.ndv, 1000);
2347    }
2348
2349    #[test]
2350    fn test_bloom_filter_builder_validates_fpp() {
2351        for wrong_val in [0.0_f64, 1.0, -0.5, 2.0] {
2352            let result = std::panic::catch_unwind(|| {
2353                BloomFilterProperties::builder().with_fpp(wrong_val).build()
2354            });
2355            assert!(
2356                result.is_err(),
2357                "with_fpp({wrong_val}).build() should reject value outside (0, 1)"
2358            );
2359        }
2360    }
2361
2362    #[test]
2363    fn test_bloom_filter_builder_try_build_validates_fpp() {
2364        for wrong_val in [0.0_f64, 1.0, -0.5, 2.0] {
2365            let result = BloomFilterProperties::builder()
2366                .with_fpp(wrong_val)
2367                .try_build();
2368            assert!(
2369                result.is_err(),
2370                "try_build() should return Err for fpp outside (0, 1)"
2371            );
2372        }
2373
2374        let ok = BloomFilterProperties::builder()
2375            .with_fpp(0.01)
2376            .with_max_ndv(1000)
2377            .try_build()
2378            .expect("valid fpp should yield Ok");
2379        assert_eq!(ok.fpp, 0.01);
2380        assert_eq!(ok.ndv, 1000);
2381    }
2382
2383    #[test]
2384    fn test_column_specific_implicit_ndv_uses_row_group_size() {
2385        let custom_row_group_size: usize = 7777;
2386        let col = ColumnPath::from("col");
2387        let props = WriterProperties::builder()
2388            .set_max_row_group_row_count(Some(custom_row_group_size))
2389            .set_column_bloom_filter_enabled(col.clone(), true)
2390            .build();
2391        let bf = props
2392            .bloom_filter_properties(&col)
2393            .expect("bloom filter should be enabled for col");
2394
2395        assert_eq!(bf.ndv, custom_row_group_size as u64);
2396        assert_eq!(bf.fpp, DEFAULT_BLOOM_FILTER_FPP);
2397    }
2398
2399    #[test]
2400    fn test_set_bloom_filter_properties_applied_globally() {
2401        let bf = BloomFilterProperties::builder()
2402            .with_fpp(0.01)
2403            .with_max_ndv(500)
2404            .build();
2405        let props = WriterProperties::builder()
2406            .set_bloom_filter_properties(bf.clone())
2407            .build();
2408
2409        assert_eq!(
2410            props.bloom_filter_properties(&ColumnPath::from("a")),
2411            Some(&bf),
2412        );
2413        assert_eq!(
2414            props.bloom_filter_properties(&ColumnPath::from("b")),
2415            Some(&bf),
2416        );
2417    }
2418
2419    #[test]
2420    fn test_set_column_bloom_filter_properties_overrides_global() {
2421        let global = BloomFilterProperties::builder()
2422            .with_fpp(0.01)
2423            .with_max_ndv(500)
2424            .build();
2425        let tailored = BloomFilterProperties::builder()
2426            .with_fpp(0.02)
2427            .with_max_ndv(1000)
2428            .build();
2429
2430        let col = ColumnPath::from("col");
2431        let props = WriterProperties::builder()
2432            .set_bloom_filter_properties(global.clone())
2433            .set_column_bloom_filter_properties(col.clone(), tailored.clone())
2434            .build();
2435
2436        assert_eq!(props.bloom_filter_properties(&col), Some(&tailored));
2437        assert_eq!(
2438            props.bloom_filter_properties(&ColumnPath::from("other")),
2439            Some(&global)
2440        );
2441    }
2442
2443    #[test]
2444    fn test_set_bloom_filter_properties_preserve_explicit_ndv() {
2445        let bf = BloomFilterProperties::builder().with_max_ndv(42).build();
2446        let props = WriterProperties::builder()
2447            .set_max_row_group_row_count(Some(99_999))
2448            .set_bloom_filter_properties(bf)
2449            .build();
2450        let result = props
2451            .bloom_filter_properties(&ColumnPath::from("col"))
2452            .expect("bloom filter should be enabled");
2453
2454        assert_eq!(
2455            result.ndv, 42,
2456            "explicit ndv must not be overridden by row-group-size fallback"
2457        );
2458    }
2459}