Skip to main content

parquet/file/
properties.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::{KeyValue, SortingColumn};
24use crate::schema::types::ColumnPath;
25use std::str::FromStr;
26use std::{collections::HashMap, sync::Arc};
27
/// Default value for [`WriterProperties::data_page_size_limit`]
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_row_count`]
pub const DEFAULT_MAX_ROW_GROUP_ROW_COUNT: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`].
///
/// Note: this is only the fallback default used when constructing [`BloomFilterProperties`]
/// directly. When using [`WriterPropertiesBuilder`], columns with bloom filters enabled
/// but without an explicit NDV will have their NDV resolved at build time to
/// [`WriterProperties::max_row_group_row_count`], which may differ from this constant
/// if the user configured a custom row group size.
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = DEFAULT_MAX_ROW_GROUP_ROW_COUNT as u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
/// Default minimum chunk size for content-defined chunking: 256 KiB.
pub const DEFAULT_CDC_MIN_CHUNK_SIZE: usize = 256 * 1024;
/// Default maximum chunk size for content-defined chunking: 1024 KiB.
pub const DEFAULT_CDC_MAX_CHUNK_SIZE: usize = 1024 * 1024;
/// Default normalization level for content-defined chunking.
pub const DEFAULT_CDC_NORM_LEVEL: i32 = 0;
76
/// EXPERIMENTAL: Options for content-defined chunking (CDC).
///
/// Content-defined chunking is an experimental feature that optimizes parquet
/// files for content addressable storage (CAS) systems by writing data pages
/// according to content-defined chunk boundaries. This allows for more
/// efficient deduplication of data across files, hence more efficient network
/// transfers and storage.
///
/// Each content-defined chunk is written as a separate parquet data page. The
/// following options control the chunks' size and the chunking process. Note
/// that the chunk size is calculated based on the logical value of the data,
/// before any encoding or compression is applied.
///
/// Enable via [`WriterPropertiesBuilder::set_content_defined_chunking`]; when
/// not set, CDC is disabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CdcOptions {
    /// Minimum chunk size in bytes, default is 256 KiB.
    /// The rolling hash will not be updated until this size is reached for each chunk.
    /// Note that all data sent through the hash function is counted towards the chunk
    /// size, including definition and repetition levels if present.
    pub min_chunk_size: usize,
    /// Maximum chunk size in bytes, default is 1024 KiB.
    /// The chunker will create a new chunk whenever the chunk size exceeds this value.
    /// Note that the parquet writer has a related [`data_page_size_limit`] property that
    /// controls the maximum size of a parquet data page after encoding. While setting
    /// `data_page_size_limit` to a smaller value than `max_chunk_size` doesn't affect
    /// the chunking effectiveness, it results in more small parquet data pages.
    ///
    /// [`data_page_size_limit`]: WriterPropertiesBuilder::set_data_page_size_limit
    pub max_chunk_size: usize,
    /// Number of bit adjustment to the gearhash mask in order to center the chunk size
    /// around the average size more aggressively, default is 0.
    /// Increasing the normalization level increases the probability of finding a chunk,
    /// improving the deduplication ratio, but also increasing the number of small chunks
    /// resulting in many small parquet data pages. The default value provides a good
    /// balance between deduplication ratio and fragmentation.
    /// Use norm_level=1 or norm_level=2 to reach a higher deduplication ratio at the
    /// expense of fragmentation. Negative values can also be used to reduce the
    /// probability of finding a chunk, resulting in larger chunks and fewer data pages.
    /// Note that values outside [-3, 3] are not recommended, prefer using the default
    /// value of 0 for most use cases.
    pub norm_level: i32,
}
118
119impl Default for CdcOptions {
120    fn default() -> Self {
121        Self {
122            min_chunk_size: DEFAULT_CDC_MIN_CHUNK_SIZE,
123            max_chunk_size: DEFAULT_CDC_MAX_CHUNK_SIZE,
124            norm_level: DEFAULT_CDC_NORM_LEVEL,
125        }
126    }
127}
128
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The chosen version can determine what features some readers will support;
/// see [`WriterPropertiesBuilder::set_writer_version`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}
140
141impl WriterVersion {
142    /// Returns writer version as `i32`.
143    pub fn as_num(&self) -> i32 {
144        match self {
145            WriterVersion::PARQUET_1_0 => 1,
146            WriterVersion::PARQUET_2_0 => 2,
147        }
148    }
149}
150
151impl FromStr for WriterVersion {
152    type Err = String;
153
154    fn from_str(s: &str) -> Result<Self, Self::Err> {
155        match s {
156            "PARQUET_1_0" | "parquet_1_0" => Ok(WriterVersion::PARQUET_1_0),
157            "PARQUET_2_0" | "parquet_2_0" => Ok(WriterVersion::PARQUET_2_0),
158            _ => Err(format!("Invalid writer version: {s}")),
159        }
160    }
161}
162
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// Defaults to [`Self::AfterRowGroup`] via [`DEFAULT_BLOOM_FILTER_POSITION`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
180
/// Reference counted writer properties.
///
/// Cloning the [`Arc`] is cheap and lets several writers share one
/// immutable [`WriterProperties`] instance.
pub type WriterPropertiesPtr = Arc<WriterProperties>;
183
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    // Best-effort limit on the number of rows in each data page.
    data_page_row_count_limit: usize,
    // Number of values written per internal batch; bounds the granularity
    // at which the other limits are enforced.
    write_batch_size: usize,
    // Maximum rows per row group; `None` means unlimited.
    max_row_group_row_count: Option<usize>,
    // Maximum estimated row group size in bytes; `None` means unlimited.
    max_row_group_bytes: Option<usize>,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    // Note: reported as `false` whenever page statistics are enabled,
    // regardless of this flag; see `offset_index_disabled()`.
    offset_index_disabled: bool,
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    // Fallback settings for columns without a per-column entry below.
    default_column_properties: ColumnProperties,
    // Per-column overrides, keyed by column path; take precedence over
    // `default_column_properties`.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
    // `None` disables experimental content-defined chunking.
    content_defined_chunking: Option<CdcOptions>,
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
239
240impl Default for WriterProperties {
241    fn default() -> Self {
242        Self::builder().build()
243    }
244}
245
impl WriterProperties {
    /// Create a new [`WriterProperties`] with the default settings
    ///
    /// See [`WriterProperties::builder`] for customising settings
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
    /// properties.
    pub fn builder() -> WriterPropertiesBuilder {
        WriterPropertiesBuilder::default()
    }

    /// Converts this [`WriterProperties`] into a [`WriterPropertiesBuilder`]
    /// Used for mutating existing property settings
    pub fn into_builder(self) -> WriterPropertiesBuilder {
        self.into()
    }

    /// Returns data page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
    pub fn data_page_size_limit(&self) -> usize {
        self.default_column_properties
            .data_page_size_limit()
            .unwrap_or(DEFAULT_PAGE_SIZE)
    }

    /// Returns data page size limit for a specific column.
    ///
    /// Takes precedence over [`Self::data_page_size_limit`].
    ///
    /// Note: this is a best effort limit based on the write batch size.
    pub fn column_data_page_size_limit(&self, col: &ColumnPath) -> usize {
        // Resolution order: per-column setting, then the default column
        // setting, then the crate-wide default constant.
        self.column_properties
            .get(col)
            .and_then(|c| c.data_page_size_limit())
            .or_else(|| self.default_column_properties.data_page_size_limit())
            .unwrap_or(DEFAULT_PAGE_SIZE)
    }

    /// Returns dictionary page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
    pub fn dictionary_page_size_limit(&self) -> usize {
        self.default_column_properties
            .dictionary_page_size_limit()
            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
    }

    /// Returns dictionary page size limit for a specific column.
    ///
    /// Takes precedence over [`Self::dictionary_page_size_limit`].
    pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
        self.column_properties
            .get(col)
            .and_then(|c| c.dictionary_page_size_limit())
            .or_else(|| self.default_column_properties.dictionary_page_size_limit())
            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
    }

    /// Returns the maximum page row count
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
    pub fn data_page_row_count_limit(&self) -> usize {
        self.data_page_row_count_limit
    }

    /// Returns configured batch size for writes.
    ///
    /// When writing a batch of data, this setting allows to split it internally into
    /// smaller batches so we can better estimate the size of a page currently being
    /// written.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
    pub fn write_batch_size(&self) -> usize {
        self.write_batch_size
    }

    /// Returns maximum number of rows in a row group, or `usize::MAX` if unlimited.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
    #[deprecated(since = "58.0.0", note = "Use `max_row_group_row_count` instead")]
    pub fn max_row_group_size(&self) -> usize {
        // Legacy API had no "unlimited" representation, so `None` maps to
        // `usize::MAX` here.
        self.max_row_group_row_count.unwrap_or(usize::MAX)
    }

    /// Returns maximum number of rows in a row group, or `None` if unlimited.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_row_count`]
    pub fn max_row_group_row_count(&self) -> Option<usize> {
        self.max_row_group_row_count
    }

    /// Returns maximum size of a row group in bytes, or `None` if unlimited.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_bytes`]
    pub fn max_row_group_bytes(&self) -> Option<usize> {
        self.max_row_group_bytes
    }

    /// Returns bloom filter position.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
        self.bloom_filter_position
    }

    /// Returns configured writer version.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
    pub fn writer_version(&self) -> WriterVersion {
        self.writer_version
    }

    /// Returns `created_by` string.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_created_by`]
    pub fn created_by(&self) -> &str {
        &self.created_by
    }

    /// Returns `true` if offset index writing is disabled.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
    pub fn offset_index_disabled(&self) -> bool {
        // If page statistics are to be collected, then do not disable the offset indexes.
        let default_page_stats_enabled =
            self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
        let column_page_stats_enabled = self
            .column_properties
            .iter()
            .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
        if default_page_stats_enabled || column_page_stats_enabled {
            return false;
        }

        self.offset_index_disabled
    }

    /// Returns `key_value_metadata` KeyValue pairs.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns sorting columns.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Returns the maximum length of truncated min/max values in the column index.
    ///
    /// `None` if truncation is disabled, must be greater than 0 otherwise.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
    pub fn column_index_truncate_length(&self) -> Option<usize> {
        self.column_index_truncate_length
    }

    /// Returns the maximum length of truncated min/max values in [`Statistics`].
    ///
    /// `None` if truncation is disabled, must be greater than 0 otherwise.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn statistics_truncate_length(&self) -> Option<usize> {
        self.statistics_truncate_length
    }

    /// Returns `true` if type coercion is enabled.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
    pub fn coerce_types(&self) -> bool {
        self.coerce_types
    }

    /// EXPERIMENTAL: Returns content-defined chunking options, or `None` if CDC is disabled.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_content_defined_chunking`]
    pub fn content_defined_chunking(&self) -> Option<&CdcOptions> {
        self.content_defined_chunking.as_ref()
    }

    /// Returns encoding for a data page, when dictionary encoding is enabled.
    ///
    /// This is not configurable.
    #[inline]
    pub fn dictionary_data_page_encoding(&self) -> Encoding {
        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
        // Dictionary values are encoded using RLE_DICTIONARY encoding.
        Encoding::RLE_DICTIONARY
    }

    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
    ///
    /// This is not configurable.
    #[inline]
    pub fn dictionary_page_encoding(&self) -> Encoding {
        // PLAIN_DICTIONARY is deprecated in writer version 1.
        // Dictionary is encoded using plain encoding.
        Encoding::PLAIN
    }

    /// Returns encoding for a column, if set.
    ///
    /// In case when dictionary is enabled, returns fallback encoding.
    ///
    /// If encoding is not set, then column writer will choose the best encoding
    /// based on the column type.
    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
        self.column_properties
            .get(col)
            .and_then(|c| c.encoding())
            .or_else(|| self.default_column_properties.encoding())
    }

    /// Returns compression codec for a column.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
    pub fn compression(&self, col: &ColumnPath) -> Compression {
        self.column_properties
            .get(col)
            .and_then(|c| c.compression())
            .or_else(|| self.default_column_properties.compression())
            .unwrap_or(DEFAULT_COMPRESSION)
    }

    /// Returns `true` if dictionary encoding is enabled for a column.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
        self.column_properties
            .get(col)
            .and_then(|c| c.dictionary_enabled())
            .or_else(|| self.default_column_properties.dictionary_enabled())
            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
    }

    /// Returns which statistics are written for a column.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
        self.column_properties
            .get(col)
            .and_then(|c| c.statistics_enabled())
            .or_else(|| self.default_column_properties.statistics_enabled())
            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
    }

    /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
        self.column_properties
            .get(col)
            .and_then(|c| c.write_page_header_statistics())
            .or_else(|| {
                self.default_column_properties
                    .write_page_header_statistics()
            })
            .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
    }

    /// Returns the [`BloomFilterProperties`] for the given column
    ///
    /// Returns `None` if bloom filter is disabled
    ///
    /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
        self.column_properties
            .get(col)
            .and_then(|c| c.bloom_filter_properties())
            .or_else(|| self.default_column_properties.bloom_filter_properties())
    }

    /// Return file encryption properties
    ///
    /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
    #[cfg(feature = "encryption")]
    pub fn file_encryption_properties(&self) -> Option<&Arc<FileEncryptionProperties>> {
        self.file_encryption_properties.as_ref()
    }
}
541
/// Builder for [`WriterProperties`] Parquet writer configuration.
///
/// See example on [`WriterProperties`]
///
/// Fields mirror those of [`WriterProperties`]; they are turned into the
/// immutable form by [`Self::build`].
#[derive(Debug, Clone)]
pub struct WriterPropertiesBuilder {
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_row_count: Option<usize>,
    max_row_group_bytes: Option<usize>,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    offset_index_disabled: bool,
    key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
    content_defined_chunking: Option<CdcOptions>,
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
566
impl Default for WriterPropertiesBuilder {
    /// Returns default state of the builder.
    fn default() -> Self {
        Self {
            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
            // Row groups are row-count-limited by default; byte-size
            // limiting is opt-in and starts unlimited.
            max_row_group_row_count: Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT),
            max_row_group_bytes: None,
            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
            writer_version: DEFAULT_WRITER_VERSION,
            created_by: DEFAULT_CREATED_BY.to_string(),
            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
            key_value_metadata: None,
            default_column_properties: Default::default(),
            column_properties: HashMap::new(),
            sorting_columns: None,
            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
            coerce_types: DEFAULT_COERCE_TYPES,
            // Content-defined chunking is experimental and disabled by default.
            content_defined_chunking: None,
            #[cfg(feature = "encryption")]
            file_encryption_properties: None,
        }
    }
}
592
593impl WriterPropertiesBuilder {
594    /// Finalizes the configuration and returns immutable writer properties struct.
595    pub fn build(self) -> WriterProperties {
596        // Resolve bloom filter NDV for columns where it wasn't explicitly set:
597        // default to max_row_group_row_count so the filter is never undersized.
598        let default_ndv = self
599            .max_row_group_row_count
600            .unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT) as u64;
601        let mut default_column_properties = self.default_column_properties;
602        default_column_properties.resolve_bloom_filter_ndv(default_ndv);
603        let mut column_properties = self.column_properties;
604        for props in column_properties.values_mut() {
605            props.resolve_bloom_filter_ndv(default_ndv);
606        }
607
608        WriterProperties {
609            data_page_row_count_limit: self.data_page_row_count_limit,
610            write_batch_size: self.write_batch_size,
611            max_row_group_row_count: self.max_row_group_row_count,
612            max_row_group_bytes: self.max_row_group_bytes,
613            bloom_filter_position: self.bloom_filter_position,
614            writer_version: self.writer_version,
615            created_by: self.created_by,
616            offset_index_disabled: self.offset_index_disabled,
617            key_value_metadata: self.key_value_metadata,
618            default_column_properties,
619            column_properties,
620            sorting_columns: self.sorting_columns,
621            column_index_truncate_length: self.column_index_truncate_length,
622            statistics_truncate_length: self.statistics_truncate_length,
623            coerce_types: self.coerce_types,
624            content_defined_chunking: self.content_defined_chunking,
625            #[cfg(feature = "encryption")]
626            file_encryption_properties: self.file_encryption_properties,
627        }
628    }
629
630    // ----------------------------------------------------------------------
631    // Writer properties related to a file
632
633    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
634    /// via [`DEFAULT_WRITER_VERSION`])
635    ///
636    /// This value can determine what features some readers will support.
637    ///
638    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
639    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
640        self.writer_version = value;
641        self
642    }
643
644    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
645    /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
646    ///
647    /// The parquet writer will attempt to limit the number of rows in
648    /// each `DataPage` to this value. Reducing this value will result
649    /// in larger parquet files, but may improve the effectiveness of
650    /// page index based predicate pushdown during reading.
651    ///
652    /// Note: this is a best effort limit based on value of
653    /// [`set_write_batch_size`](Self::set_write_batch_size).
654    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
655        self.data_page_row_count_limit = value;
656        self
657    }
658
    /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
    ///
    /// For performance reasons, data for each column is written in
    /// batches of this size.
    ///
    /// Additional limits such as
    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
    /// are checked between batches, and thus the write batch size value acts as an
    /// upper-bound on the enforcement granularity of other limits.
    pub fn set_write_batch_size(mut self, value: usize) -> Self {
        self.write_batch_size = value;
        self
    }
672
673    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
674    /// via [`DEFAULT_MAX_ROW_GROUP_ROW_COUNT`]).
675    ///
676    /// # Panics
677    /// If the value is set to 0.
678    #[deprecated(since = "58.0.0", note = "Use `set_max_row_group_row_count` instead")]
679    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
680        assert!(value > 0, "Cannot have a 0 max row group size");
681        self.max_row_group_row_count = Some(value);
682        self
683    }
684
685    /// Sets maximum number of rows in a row group, or `None` for unlimited.
686    ///
687    /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
688    /// the row group with the smaller limit will be produced.
689    ///
690    /// # Panics
691    /// If the value is `Some(0)`.
692    pub fn set_max_row_group_row_count(mut self, value: Option<usize>) -> Self {
693        assert_ne!(value, Some(0), "Cannot have a 0 max row group row count");
694        self.max_row_group_row_count = value;
695        self
696    }
697
698    /// Sets maximum size of a row group in bytes, or `None` for unlimited.
699    ///
700    /// Row groups are flushed when their estimated encoded size exceeds this threshold.
701    /// This is similar to the official Java implementation for `parquet.block.size`'s behavior.
702    ///
703    /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
704    /// the row group with the smaller limit will be produced.
705    ///
706    /// # Panics
707    /// If the value is `Some(0)`.
708    pub fn set_max_row_group_bytes(mut self, value: Option<usize>) -> Self {
709        assert_ne!(value, Some(0), "Cannot have a 0 max row group bytes");
710        self.max_row_group_bytes = value;
711        self
712    }
713
714    /// Sets where in the final file Bloom Filters are written (defaults to  [`AfterRowGroup`]
715    /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
716    ///
717    /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
718    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
719        self.bloom_filter_position = value;
720        self
721    }
722
723    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
724    /// [`DEFAULT_CREATED_BY`]).
725    ///
726    /// This is a string that will be written into the file metadata
727    pub fn set_created_by(mut self, value: String) -> Self {
728        self.created_by = value;
729        self
730    }
731
732    /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
733    /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
734    ///
735    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
736    ///
737    /// Note: As the offset indexes are useful for accessing data by row number,
738    /// they are always written by default, regardless of whether other statistics
739    /// are enabled. Disabling this metadata may result in a degradation in read
740    /// performance, so use this option with care.
741    ///
742    /// [`Page`]: EnabledStatistics::Page
743    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
744        self.offset_index_disabled = value;
745        self
746    }
747
748    /// Sets "key_value_metadata" property (defaults to `None`).
749    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
750        self.key_value_metadata = value;
751        self
752    }
753
754    /// Sets sorting order of rows in the row group if any (defaults to `None`).
755    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
756        self.sorting_columns = value;
757        self
758    }
759
760    /// Sets the max length of min/max value fields when writing the column
761    /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
762    ///
763    /// This can be used to prevent columns with very long values (hundreds of
764    /// bytes long) from causing the parquet metadata to become huge.
765    ///
766    /// # Notes
767    ///
768    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
769    /// set to [`EnabledStatistics::Page`].
770    ///
771    /// * If `Some`, must be greater than 0, otherwise will panic
772    /// * If `None`, there's no effective limit.
773    ///
774    /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
775    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
776        if let Some(value) = max_length {
777            assert!(
778                value > 0,
779                "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`."
780            );
781        }
782
783        self.column_index_truncate_length = max_length;
784        self
785    }
786
787    /// Sets the max length of min/max value fields in row group and data page header
788    /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
789    ///
790    /// # Notes
791    /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
792    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
793    /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
794    /// [`EnabledStatistics::Page`].
795    ///
796    /// * If `Some`, must be greater than 0, otherwise will panic
797    /// * If `None`, there's no effective limit.
798    ///
799    /// # See also
800    /// Truncation of Page Index statistics is controlled separately via
801    /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
802    ///
803    /// [`Statistics`]: crate::file::statistics::Statistics
804    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
805        if let Some(value) = max_length {
806            assert!(
807                value > 0,
808                "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`."
809            );
810        }
811
812        self.statistics_truncate_length = max_length;
813        self
814    }
815
816    /// Should the writer coerce types to parquet native types (defaults to `false` via
817    /// [`DEFAULT_COERCE_TYPES`]).
818    ///
819    /// Leaving this option the default `false` will ensure the exact same data
820    /// written to parquet using this library will be read.
821    ///
822    /// Setting this option to `true` will result in parquet files that can be
823    /// read by more readers, but potentially lose information in the process.
824    ///
825    /// * Types such as [`DataType::Date64`], which have no direct corresponding
826    ///   Parquet type, may be stored with lower precision.
827    ///
828    /// * The internal field names of `List` and `Map` types will be renamed if
829    ///   necessary to match what is required by the newest Parquet specification.
830    ///
831    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
832    ///
833    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
834    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
835    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
836        self.coerce_types = coerce_types;
837        self
838    }
839
840    /// EXPERIMENTAL: Sets content-defined chunking options, or disables CDC with `None`.
841    ///
842    /// When enabled, data page boundaries are determined by a rolling hash of the
843    /// column values, so unchanged data produces identical byte sequences across
844    /// file versions. This enables efficient deduplication on content-addressable
845    /// storage systems.
846    ///
847    /// Only supported through the Arrow writer interface ([`ArrowWriter`]).
848    ///
849    /// # Panics
850    ///
851    /// Panics if `min_chunk_size == 0` or `max_chunk_size <= min_chunk_size`.
852    ///
853    /// [`ArrowWriter`]: crate::arrow::arrow_writer::ArrowWriter
854    pub fn set_content_defined_chunking(mut self, options: Option<CdcOptions>) -> Self {
855        if let Some(ref options) = options {
856            assert!(
857                options.min_chunk_size > 0,
858                "min_chunk_size must be positive"
859            );
860            assert!(
861                options.max_chunk_size > options.min_chunk_size,
862                "max_chunk_size ({}) must be greater than min_chunk_size ({})",
863                options.max_chunk_size,
864                options.min_chunk_size
865            );
866        }
867        self.content_defined_chunking = options;
868        self
869    }
870
871    /// Sets FileEncryptionProperties (defaults to `None`)
872    #[cfg(feature = "encryption")]
873    pub fn with_file_encryption_properties(
874        mut self,
875        file_encryption_properties: Arc<FileEncryptionProperties>,
876    ) -> Self {
877        self.file_encryption_properties = Some(file_encryption_properties);
878        self
879    }
880
881    // ----------------------------------------------------------------------
882    // Setters for any column (global)
883
884    /// Sets default encoding for all columns.
885    ///
886    /// If dictionary is not enabled, this is treated as a primary encoding for all
887    /// columns. In case when dictionary is enabled for any column, this value is
888    /// considered to be a fallback encoding for that column.
889    ///
890    /// # Panics
891    ///
892    /// if dictionary encoding is specified, regardless of dictionary
893    /// encoding flag being set.
894    pub fn set_encoding(mut self, value: Encoding) -> Self {
895        self.default_column_properties.set_encoding(value);
896        self
897    }
898
899    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
900    /// [`DEFAULT_COMPRESSION`]).
901    ///
902    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
903    pub fn set_compression(mut self, value: Compression) -> Self {
904        self.default_column_properties.set_compression(value);
905        self
906    }
907
908    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
909    /// via [`DEFAULT_DICTIONARY_ENABLED`]).
910    ///
911    /// Use this method to set dictionary encoding, instead of explicitly specifying
912    /// encoding in `set_encoding` method.
913    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
914        self.default_column_properties.set_dictionary_enabled(value);
915        self
916    }
917
918    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
919    /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
920    ///
921    /// The parquet writer will attempt to limit the size of each
922    /// `DataPage` used to store dictionaries to this many
923    /// bytes. Reducing this value will result in larger parquet
924    /// files, but may improve the effectiveness of page index based
925    /// predicate pushdown during reading.
926    ///
927    /// Note: this is a best effort limit based on value of
928    /// [`set_write_batch_size`](Self::set_write_batch_size).
929    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
930        self.default_column_properties
931            .set_dictionary_page_size_limit(value);
932        self
933    }
934
935    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
936    /// via [`DEFAULT_PAGE_SIZE`]).
937    ///
938    /// The parquet writer will attempt to limit the sizes of each
939    /// `DataPage` to this many bytes. Reducing this value will result
940    /// in larger parquet files, but may improve the effectiveness of
941    /// page index based predicate pushdown during reading.
942    ///
943    /// Note: this is a best effort limit based on value of
944    /// [`set_write_batch_size`](Self::set_write_batch_size).
945    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
946        self.default_column_properties
947            .set_data_page_size_limit(value);
948        self
949    }
950
951    /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
952    /// [`DEFAULT_STATISTICS_ENABLED`]).
953    ///
954    /// [`Page`]: EnabledStatistics::Page
955    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
956        self.default_column_properties.set_statistics_enabled(value);
957        self
958    }
959
960    /// enable/disable writing [`Statistics`] in the page header
961    /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
962    ///
963    /// Only applicable if [`Page`] level statistics are gathered.
964    ///
965    /// Setting this value to `true` can greatly increase the size of the resulting Parquet
966    /// file while yielding very little added benefit. Most modern Parquet implementations
967    /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
968    /// those in the page header.
969    ///
970    /// # Note
971    ///
972    /// Prior to version 56.0.0, the `parquet` crate always wrote these
973    /// statistics (the equivalent of setting this option to `true`). This was
974    /// changed in 56.0.0 to follow the recommendation in the Parquet
975    /// specification. See [issue #7580] for more details.
976    ///
977    /// [`Statistics`]: crate::file::statistics::Statistics
978    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
979    /// [`Page`]: EnabledStatistics::Page
980    /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
981    pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
982        self.default_column_properties
983            .set_write_page_header_statistics(value);
984        self
985    }
986
987    /// Sets if bloom filter should be written for all columns (defaults to `false`).
988    ///
989    /// # Notes
990    ///
991    /// * If the bloom filter is enabled previously then it is a no-op.
992    ///
993    /// * If the bloom filter is not enabled, default values for ndv and fpp
994    ///   value are used used. See [`set_bloom_filter_ndv`] and
995    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
996    ///
997    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
998    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
999    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
1000        self.default_column_properties
1001            .set_bloom_filter_enabled(value);
1002        self
1003    }
1004
1005    /// Sets the default target bloom filter false positive probability (fpp)
1006    /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
1007    ///
1008    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
1009    /// been called.
1010    ///
1011    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1012    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
1013        self.default_column_properties.set_bloom_filter_fpp(value);
1014        self
1015    }
1016
1017    /// Sets default maximum expected number of distinct values (ndv) for bloom filter
1018    /// for all columns (defaults to [`DEFAULT_BLOOM_FILTER_NDV`]).
1019    ///
1020    /// The bloom filter is initially sized for this many distinct values at the
1021    /// configured FPP, then folded down after all values are inserted to achieve
1022    /// optimal size. A good heuristic is to set this to the expected number of rows
1023    /// in the row group.
1024    ///
1025    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
1026    /// been called.
1027    ///
1028    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1029    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
1030        self.default_column_properties.set_bloom_filter_ndv(value);
1031        self
1032    }
1033
1034    // ----------------------------------------------------------------------
1035    // Setters for a specific column
1036
1037    /// Helper method to get existing or new mutable reference of column properties.
1038    #[inline]
1039    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
1040        self.column_properties.entry(col).or_default()
1041    }
1042
1043    /// Sets encoding for a specific column.
1044    ///
1045    /// Takes precedence over [`Self::set_encoding`].
1046    ///
1047    /// If dictionary is not enabled, this is treated as a primary encoding for this
1048    /// column. In case when dictionary is enabled for this column, either through
1049    /// global defaults or explicitly, this value is considered to be a fallback
1050    /// encoding for this column.
1051    ///
1052    /// # Panics
1053    /// If user tries to set dictionary encoding here, regardless of dictionary
1054    /// encoding flag being set.
1055    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
1056        self.get_mut_props(col).set_encoding(value);
1057        self
1058    }
1059
1060    /// Sets compression codec for a specific column.
1061    ///
1062    /// Takes precedence over [`Self::set_compression`].
1063    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
1064        self.get_mut_props(col).set_compression(value);
1065        self
1066    }
1067
1068    /// Sets flag to enable/disable dictionary encoding for a specific column.
1069    ///
1070    /// Takes precedence over [`Self::set_dictionary_enabled`].
1071    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
1072        self.get_mut_props(col).set_dictionary_enabled(value);
1073        self
1074    }
1075
1076    /// Sets dictionary page size limit for a specific column.
1077    ///
1078    /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
1079    pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
1080        self.get_mut_props(col)
1081            .set_dictionary_page_size_limit(value);
1082        self
1083    }
1084
1085    /// Sets data page size limit for a specific column.
1086    ///
1087    /// Takes precedence over [`Self::set_data_page_size_limit`].
1088    pub fn set_column_data_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
1089        self.get_mut_props(col).set_data_page_size_limit(value);
1090        self
1091    }
1092
1093    /// Sets [`EnabledStatistics`] level for a specific column.
1094    ///
1095    /// Takes precedence over [`Self::set_statistics_enabled`].
1096    pub fn set_column_statistics_enabled(
1097        mut self,
1098        col: ColumnPath,
1099        value: EnabledStatistics,
1100    ) -> Self {
1101        self.get_mut_props(col).set_statistics_enabled(value);
1102        self
1103    }
1104
1105    /// Sets whether to write [`Statistics`] in the page header for a specific column.
1106    ///
1107    /// Takes precedence over [`Self::set_write_page_header_statistics`].
1108    ///
1109    /// [`Statistics`]: crate::file::statistics::Statistics
1110    pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
1111        self.get_mut_props(col)
1112            .set_write_page_header_statistics(value);
1113        self
1114    }
1115
1116    /// Sets whether a bloom filter should be written for a specific column.
1117    ///
1118    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
1119    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
1120        self.get_mut_props(col).set_bloom_filter_enabled(value);
1121        self
1122    }
1123
1124    /// Sets the false positive probability for bloom filter for a specific column.
1125    ///
1126    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
1127    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
1128        self.get_mut_props(col).set_bloom_filter_fpp(value);
1129        self
1130    }
1131
1132    /// Sets the number of distinct values for bloom filter for a specific column.
1133    ///
1134    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
1135    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
1136        self.get_mut_props(col).set_bloom_filter_ndv(value);
1137        self
1138    }
1139}
1140
impl From<WriterProperties> for WriterPropertiesBuilder {
    /// Converts finalized `WriterProperties` back into a builder so that
    /// individual settings can be changed before building again. Every field
    /// is carried over unchanged.
    fn from(props: WriterProperties) -> Self {
        WriterPropertiesBuilder {
            data_page_row_count_limit: props.data_page_row_count_limit,
            write_batch_size: props.write_batch_size,
            max_row_group_row_count: props.max_row_group_row_count,
            max_row_group_bytes: props.max_row_group_bytes,
            bloom_filter_position: props.bloom_filter_position,
            writer_version: props.writer_version,
            created_by: props.created_by,
            offset_index_disabled: props.offset_index_disabled,
            key_value_metadata: props.key_value_metadata,
            default_column_properties: props.default_column_properties,
            column_properties: props.column_properties,
            sorting_columns: props.sorting_columns,
            column_index_truncate_length: props.column_index_truncate_length,
            statistics_truncate_length: props.statistics_truncate_length,
            coerce_types: props.coerce_types,
            content_defined_chunking: props.content_defined_chunking,
            #[cfg(feature = "encryption")]
            file_encryption_properties: props.file_encryption_properties,
        }
    }
}
1165
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. In addition, this will enable the writing
    /// of the column index (the offset index is always written regardless of
    /// this setting). See [`ParquetColumnIndex`] for
    /// more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}

impl FromStr for EnabledStatistics {
    type Err = String;

    /// Parses a statistics level name, ignoring ASCII case.
    ///
    /// Accepts `"none"`, `"chunk"`, and `"page"` in any capitalization
    /// (previously only all-lowercase and all-uppercase were accepted);
    /// anything else is rejected with a descriptive error.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.to_ascii_lowercase().as_str() {
            "none" => Ok(EnabledStatistics::None),
            "chunk" => Ok(EnabledStatistics::Chunk),
            "page" => Ok(EnabledStatistics::Page),
            _ => Err(format!("Invalid statistics arg: {s}")),
        }
    }
}
1209
impl Default for EnabledStatistics {
    /// Returns [`DEFAULT_STATISTICS_ENABLED`] ([`EnabledStatistics::Page`]).
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
1215
/// Controls the bloom filter to be computed by the writer.
///
/// The bloom filter is initially sized for `ndv` distinct values at the given `fpp`, then
/// automatically folded down after all values are inserted to achieve optimal size while
/// maintaining the target `fpp`. See [`Sbbf::fold_to_target_fpp`] for details on the
/// folding algorithm.
///
/// [`Sbbf::fold_to_target_fpp`]: crate::bloom_filter::Sbbf::fold_to_target_fpp
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability. This should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp, the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// This value also serves as the target FPP for bloom filter folding: after all values
    /// are inserted, the filter is folded down to the smallest size that still meets this FPP.
    pub fpp: f64,
    /// Maximum expected number of distinct values. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// When not explicitly set via the builder, this defaults to
    /// [`max_row_group_row_count`](WriterProperties::max_row_group_row_count) (resolved at
    /// build time). The bloom filter is initially sized for this many distinct values at the
    /// given `fpp`, then folded down after insertion to achieve optimal size. A good heuristic
    /// is to set this to the expected number of rows in the row group. If fewer distinct values
    /// are actually written, the filter will be automatically compacted via folding.
    ///
    /// Thus the only negative side of overestimating this value is that the bloom filter
    /// will use more memory during writing than necessary, but it will not affect the final
    /// bloom filter size on disk.
    ///
    /// If you wish to reduce memory usage during writing and are able to make a reasonable estimate
    /// of the number of distinct values in a row group, it is recommended to set this value explicitly
    /// rather than relying on the default dynamic sizing based on `max_row_group_row_count`.
    /// If you do set this value explicitly it is probably best to set it for each column
    /// individually via [`WriterPropertiesBuilder::set_column_bloom_filter_ndv`] rather than globally,
    /// since different columns may have different numbers of distinct values.
    pub ndv: u64,
}
1260
1261impl Default for BloomFilterProperties {
1262    fn default() -> Self {
1263        BloomFilterProperties {
1264            fpp: DEFAULT_BLOOM_FILTER_FPP,
1265            ndv: DEFAULT_BLOOM_FILTER_NDV,
1266        }
1267    }
1268}
1269
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    /// Fallback (non-dictionary) encoding; dictionary encodings are rejected here.
    encoding: Option<Encoding>,
    /// Compression codec for this column's pages.
    codec: Option<Compression>,
    /// Best-effort data page size limit, in bytes.
    data_page_size_limit: Option<usize>,
    /// Best-effort dictionary page size limit, in bytes.
    dictionary_page_size_limit: Option<usize>,
    /// Whether dictionary encoding is enabled for this column.
    dictionary_enabled: Option<bool>,
    /// Statistics level (none / chunk / page) for this column.
    statistics_enabled: Option<EnabledStatistics>,
    /// Whether statistics are additionally written into page headers.
    write_page_header_statistics: Option<bool>,
    /// bloom filter related properties
    bloom_filter_properties: Option<BloomFilterProperties>,
    /// Whether the bloom filter NDV was explicitly set by the user
    bloom_filter_ndv_is_set: bool,
}
1288
1289impl ColumnProperties {
1290    /// Sets encoding for this column.
1291    ///
1292    /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1293    /// In case when dictionary is enabled for a column, this value is considered to
1294    /// be a fallback encoding.
1295    ///
1296    /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1297    /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1298    /// for a column.
1299    fn set_encoding(&mut self, value: Encoding) {
1300        if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1301            panic!("Dictionary encoding can not be used as fallback encoding");
1302        }
1303        self.encoding = Some(value);
1304    }
1305
1306    /// Sets compression codec for this column.
1307    fn set_compression(&mut self, value: Compression) {
1308        self.codec = Some(value);
1309    }
1310
1311    /// Sets data page size limit for this column.
1312    fn set_data_page_size_limit(&mut self, value: usize) {
1313        self.data_page_size_limit = Some(value);
1314    }
1315
1316    /// Sets whether dictionary encoding is enabled for this column.
1317    fn set_dictionary_enabled(&mut self, enabled: bool) {
1318        self.dictionary_enabled = Some(enabled);
1319    }
1320
1321    /// Sets dictionary page size limit for this column.
1322    fn set_dictionary_page_size_limit(&mut self, value: usize) {
1323        self.dictionary_page_size_limit = Some(value);
1324    }
1325
1326    /// Sets the statistics level for this column.
1327    fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1328        self.statistics_enabled = Some(enabled);
1329    }
1330
1331    /// Sets whether to write statistics in the page header for this column.
1332    fn set_write_page_header_statistics(&mut self, enabled: bool) {
1333        self.write_page_header_statistics = Some(enabled);
1334    }
1335
1336    /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1337    /// otherwise it is a no-op.
1338    /// If `value` is `false`, resets bloom filter properties to `None`.
1339    fn set_bloom_filter_enabled(&mut self, value: bool) {
1340        if value && self.bloom_filter_properties.is_none() {
1341            self.bloom_filter_properties = Some(Default::default())
1342        } else if !value {
1343            self.bloom_filter_properties = None
1344        }
1345    }
1346
1347    /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1348    /// bloom filter if not previously enabled.
1349    ///
1350    /// # Panics
1351    ///
1352    /// Panics if the `value` is not between 0 and 1 exclusive
1353    fn set_bloom_filter_fpp(&mut self, value: f64) {
1354        assert!(
1355            value > 0. && value < 1.0,
1356            "fpp must be between 0 and 1 exclusive, got {value}"
1357        );
1358
1359        self.bloom_filter_properties
1360            .get_or_insert_with(Default::default)
1361            .fpp = value;
1362    }
1363
1364    /// Sets the maximum expected number of distinct (unique) values for bloom filter for this
1365    /// column, and implicitly enables bloom filter if not previously enabled.
1366    fn set_bloom_filter_ndv(&mut self, value: u64) {
1367        self.bloom_filter_properties
1368            .get_or_insert_with(Default::default)
1369            .ndv = value;
1370        self.bloom_filter_ndv_is_set = true;
1371    }
1372
1373    /// Returns optional encoding for this column.
1374    fn encoding(&self) -> Option<Encoding> {
1375        self.encoding
1376    }
1377
1378    /// Returns optional compression codec for this column.
1379    fn compression(&self) -> Option<Compression> {
1380        self.codec
1381    }
1382
1383    /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1384    /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1385    /// been provided.
1386    fn dictionary_enabled(&self) -> Option<bool> {
1387        self.dictionary_enabled
1388    }
1389
1390    /// Returns optional dictionary page size limit for this column.
1391    fn dictionary_page_size_limit(&self) -> Option<usize> {
1392        self.dictionary_page_size_limit
1393    }
1394
1395    /// Returns optional data page size limit for this column.
1396    fn data_page_size_limit(&self) -> Option<usize> {
1397        self.data_page_size_limit
1398    }
1399
1400    /// Returns optional statistics level requested for this column. If result is `None`,
1401    /// then no setting has been provided.
1402    fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1403        self.statistics_enabled
1404    }
1405
1406    /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
1407    /// column.
1408    ///
1409    /// [`Statistics`]: crate::file::statistics::Statistics
1410    fn write_page_header_statistics(&self) -> Option<bool> {
1411        self.write_page_header_statistics
1412    }
1413
1414    /// Returns the bloom filter properties, or `None` if not enabled
1415    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1416        self.bloom_filter_properties.as_ref()
1417    }
1418
1419    /// If bloom filter is enabled and NDV was not explicitly set, resolve it to the
1420    /// given `default_ndv` (typically derived from `max_row_group_row_count`).
1421    fn resolve_bloom_filter_ndv(&mut self, default_ndv: u64) {
1422        if !self.bloom_filter_ndv_is_set {
1423            if let Some(ref mut bf) = self.bloom_filter_properties {
1424                bf.ndv = default_ndv;
1425            }
1426        }
1427    }
1428}
1429
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

// NOTE(review): presumably the fallback values applied by
// `ReaderPropertiesBuilder` when the corresponding option is left unset —
// confirm against `with_defaults`/`build` (defined outside this view).
const DEFAULT_READ_BLOOM_FILTER: bool = false;
const DEFAULT_READ_PAGE_STATS: bool = false;
1435
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    // Options forwarded to the decompression codecs (e.g. LZ4 backward compatibility).
    codec_options: CodecOptions,
    // Whether bloom filters should be read from the file.
    read_bloom_filter: bool,
    // Whether page-level statistics should be decoded when present.
    read_page_stats: bool,
}
1459
impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options used when decompressing pages.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether to read bloom filters from the file.
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }

    /// Returns whether to decode page level statistics.
    pub(crate) fn read_page_stats(&self) -> bool {
        self.read_page_stats
    }
}
1481
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    // Builder for codec-level options (e.g. LZ4 backward compatibility).
    codec_options_builder: CodecOptionsBuilder,
    // `None` means "use the default" (`DEFAULT_READ_BLOOM_FILTER`).
    read_bloom_filter: Option<bool>,
    // `None` means "use the default" (`DEFAULT_READ_PAGE_STATS`).
    read_page_stats: Option<bool>,
}
1489
1490/// Reader properties builder.
1491impl ReaderPropertiesBuilder {
1492    /// Returns default state of the builder.
1493    fn with_defaults() -> Self {
1494        Self {
1495            codec_options_builder: CodecOptionsBuilder::default(),
1496            read_bloom_filter: None,
1497            read_page_stats: None,
1498        }
1499    }
1500
1501    /// Finalizes the configuration and returns immutable reader properties struct.
1502    pub fn build(self) -> ReaderProperties {
1503        ReaderProperties {
1504            codec_options: self.codec_options_builder.build(),
1505            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1506            read_page_stats: self.read_page_stats.unwrap_or(DEFAULT_READ_PAGE_STATS),
1507        }
1508    }
1509
1510    /// Enable/disable backward compatible LZ4.
1511    ///
1512    /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1513    /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1514    /// with files generated by older versions of this library, and LZ4_RAW, for backward
1515    /// compatibility with files generated by older versions of parquet-cpp.
1516    ///
1517    /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1518    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1519        self.codec_options_builder = self
1520            .codec_options_builder
1521            .set_backward_compatible_lz4(value);
1522        self
1523    }
1524
1525    /// Enable/disable reading bloom filter
1526    ///
1527    /// If reading bloom filter is enabled, bloom filter will be read from the file.
1528    /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1529    ///
1530    /// By default bloom filter is set to be read.
1531    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1532        self.read_bloom_filter = Some(value);
1533        self
1534    }
1535
1536    /// Enable/disable reading page-level statistics
1537    ///
1538    /// If set to `true`, then the reader will decode and populate the [`Statistics`] for
1539    /// each page, if present.
1540    /// If set to `false`, then the reader will skip decoding the statistics.
1541    ///
1542    /// By default statistics will not be decoded.
1543    ///
1544    /// [`Statistics`]: crate::file::statistics::Statistics
1545    pub fn set_read_page_statistics(mut self, value: bool) -> Self {
1546        self.read_page_stats = Some(value);
1547        self
1548    }
1549}
1550
#[cfg(test)]
mod tests {
    use super::*;

    // `as_num` maps each writer version variant to its numeric format version.
    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    // Every accessor on a default-constructed `WriterProperties` must agree
    // with the corresponding `DEFAULT_*` constant.
    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(
            props.max_row_group_row_count(),
            Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT)
        );
        assert_eq!(props.max_row_group_bytes(), None);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        // Bloom filters are opt-in, so no properties are present by default.
        assert!(
            props
                .bloom_filter_properties(&ColumnPath::from("col"))
                .is_none()
        );
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    // The fallback-encoding check applies to per-column encodings regardless of
    // whether dictionary encoding is enabled for the column.
    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    // Exercises file-level, global-column, and per-column settings together,
    // and verifies that `into_builder().build()` round-trips all of them.
    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_row_count(Some(40))
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        fn test_props(props: &WriterProperties) {
            assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
            assert_eq!(props.data_page_size_limit(), 10);
            assert_eq!(props.dictionary_page_size_limit(), 20);
            assert_eq!(props.write_batch_size(), 30);
            assert_eq!(props.max_row_group_row_count(), Some(40));
            assert_eq!(props.created_by(), "default");
            assert_eq!(
                props.key_value_metadata(),
                Some(&vec![
                    KeyValue::new("key".to_string(), "value".to_string(),)
                ])
            );

            // Column "a" has no specific settings, so global settings apply.
            assert_eq!(
                props.encoding(&ColumnPath::from("a")),
                Some(Encoding::DELTA_BINARY_PACKED)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("a")),
                Compression::GZIP(Default::default())
            );
            assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("a")),
                EnabledStatistics::None
            );

            // Column "col" overrides the global settings.
            assert_eq!(
                props.encoding(&ColumnPath::from("col")),
                Some(Encoding::RLE)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("col")),
                Compression::SNAPPY
            );
            assert!(props.dictionary_enabled(&ColumnPath::from("col")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("col")),
                EnabledStatistics::Chunk
            );
            assert_eq!(
                props.bloom_filter_properties(&ColumnPath::from("col")),
                Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
            );
        }

        // Test direct build of properties
        test_props(&props);

        // Test that into_builder() gives the same result
        let props_into_builder_and_back = props.into_builder().build();
        test_props(&props_into_builder_and_back);
    }

    // Settings that are not explicitly overridden must fall back to defaults.
    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        // Enabling bloom filters without ndv/fpp falls back to defaults.
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: DEFAULT_BLOOM_FILTER_FPP,
                ndv: DEFAULT_BLOOM_FILTER_NDV,
            })
        );
    }

    // The deprecated `set_max_row_group_size` must stay equivalent to
    // `set_max_row_group_row_count`.
    #[test]
    #[allow(deprecated)]
    fn test_writer_properties_deprecated_max_row_group_size_still_works() {
        let props = WriterProperties::builder()
            .set_max_row_group_size(42)
            .build();

        assert_eq!(props.max_row_group_row_count(), Some(42));
        assert_eq!(props.max_row_group_size(), 42);
    }

    #[test]
    #[should_panic(expected = "Cannot have a 0 max row group row count")]
    fn test_writer_properties_panic_on_zero_row_group_row_count() {
        let _ = WriterProperties::builder().set_max_row_group_row_count(Some(0));
    }

    #[test]
    #[should_panic(expected = "Cannot have a 0 max row group bytes")]
    fn test_writer_properties_panic_on_zero_row_group_bytes() {
        let _ = WriterProperties::builder().set_max_row_group_bytes(Some(0));
    }

    // Setting only ndv or only fpp implicitly enables the bloom filter and
    // fills the other field with its default.
    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: DEFAULT_BLOOM_FILTER_FPP,
                ndv: 100,
            })
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: DEFAULT_BLOOM_FILTER_NDV,
            })
        );
    }

    // A per-column dictionary page size limit overrides the writer-level
    // limit; other columns keep the writer-level value.
    #[test]
    fn test_writer_properties_column_dictionary_page_size_limit() {
        let props = WriterProperties::builder()
            .set_dictionary_page_size_limit(100)
            .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
            .build();

        assert_eq!(props.dictionary_page_size_limit(), 100);
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
            10
        );
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
            100
        );
    }

    // Same override semantics for per-column data page size limits.
    #[test]
    fn test_writer_properties_column_data_page_size_limit() {
        let props = WriterProperties::builder()
            .set_data_page_size_limit(100)
            .set_column_data_page_size_limit(ColumnPath::from("col"), 10)
            .build();

        assert_eq!(props.data_page_size_limit(), 100);
        assert_eq!(
            props.column_data_page_size_limit(&ColumnPath::from("col")),
            10
        );
        assert_eq!(
            props.column_data_page_size_limit(&ColumnPath::from("other")),
            100
        );
    }

    // Defaults: backward compatible LZ4 on, bloom filter reading off.
    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(!props.read_bloom_filter());
    }

    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
    }

    // `FromStr` for `WriterVersion` is case-insensitive and rejects unknown
    // versions with a descriptive error string.
    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    // `FromStr` for `EnabledStatistics` is case-insensitive and rejects
    // unknown statistics levels with a descriptive error string.
    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);

        //test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }

    // `CdcOptions` equality: identical configurations compare equal, distinct
    // configurations compare unequal.
    #[test]
    fn test_cdc_options_equality() {
        let opts = CdcOptions::default();
        assert_eq!(opts, CdcOptions::default());

        let custom = CdcOptions {
            min_chunk_size: 1024,
            max_chunk_size: 8192,
            norm_level: 1,
        };
        assert_eq!(custom, custom);
        assert_ne!(opts, custom);
    }
}