parquet/file/properties.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::errors::{ParquetError, Result};
24use crate::file::metadata::{KeyValue, SortingColumn};
25use crate::schema::types::ColumnPath;
26use std::str::FromStr;
27use std::{collections::HashMap, sync::Arc};
28
/// Default value for [`WriterProperties::data_page_size_limit`]
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
///
/// Defined as [`DEFAULT_PAGE_SIZE`], so the two defaults always agree.
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_row_count`]
pub const DEFAULT_MAX_ROW_GROUP_ROW_COUNT: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp()`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv()`].
///
/// Note: this is only the fallback default used when constructing [`BloomFilterProperties`]
/// directly. When using [`WriterPropertiesBuilder`], columns with bloom filters enabled
/// but without an explicit NDV will have their NDV resolved at build time to
/// [`WriterProperties::max_row_group_row_count`], which may differ from this constant
/// if the user configured a custom row group size.
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = DEFAULT_MAX_ROW_GROUP_ROW_COUNT as u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
/// Default value for [`WriterProperties::data_page_v2_compression_ratio_threshold`]
pub const DEFAULT_DATA_PAGE_V2_COMPRESSION_RATIO_THRESHOLD: f64 = 1.0;
/// Default value for [`WriterProperties::write_path_in_schema`]
pub const DEFAULT_WRITE_PATH_IN_SCHEMA: bool = true;
/// Default minimum chunk size for content-defined chunking: 256 KiB.
pub const DEFAULT_CDC_MIN_CHUNK_SIZE: usize = 256 * 1024;
/// Default maximum chunk size for content-defined chunking: 1024 KiB.
pub const DEFAULT_CDC_MAX_CHUNK_SIZE: usize = 1024 * 1024;
/// Default normalization level for content-defined chunking.
pub const DEFAULT_CDC_NORM_LEVEL: i32 = 0;
81
/// EXPERIMENTAL: Options for content-defined chunking (CDC).
///
/// Content-defined chunking is an experimental feature that optimizes parquet
/// files for content addressable storage (CAS) systems by writing data pages
/// according to content-defined chunk boundaries. This allows for more
/// efficient deduplication of data across files, hence more efficient network
/// transfers and storage.
///
/// Each content-defined chunk is written as a separate parquet data page. The
/// following options control the chunks' size and the chunking process. Note
/// that the chunk size is calculated based on the logical value of the data,
/// before any encoding or compression is applied.
///
/// The [`Default`] implementation fills every field from the matching
/// `DEFAULT_CDC_*` constant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CdcOptions {
    /// Minimum chunk size in bytes, default is 256 KiB.
    /// The rolling hash will not be updated until this size is reached for each chunk.
    /// Note that all data sent through the hash function is counted towards the chunk
    /// size, including definition and repetition levels if present.
    pub min_chunk_size: usize,
    /// Maximum chunk size in bytes, default is 1024 KiB.
    /// The chunker will create a new chunk whenever the chunk size exceeds this value.
    /// Note that the parquet writer has a related [`data_page_size_limit`] property that
    /// controls the maximum size of a parquet data page after encoding. While setting
    /// `data_page_size_limit` to a smaller value than `max_chunk_size` doesn't affect
    /// the chunking effectiveness, it results in more small parquet data pages.
    ///
    /// [`data_page_size_limit`]: WriterPropertiesBuilder::set_data_page_size_limit
    pub max_chunk_size: usize,
    /// Bit adjustment applied to the gearhash mask in order to center the chunk size
    /// around the average size more aggressively, default is 0.
    /// Increasing the normalization level increases the probability of finding a chunk,
    /// improving the deduplication ratio, but also increasing the number of small chunks
    /// resulting in many small parquet data pages. The default value provides a good
    /// balance between deduplication ratio and fragmentation.
    /// Use `norm_level=1` or `norm_level=2` to reach a higher deduplication ratio at the
    /// expense of fragmentation. Negative values can also be used to reduce the
    /// probability of finding a chunk, resulting in larger chunks and fewer data pages.
    /// Note that values outside `[-3, 3]` are not recommended; prefer using the default
    /// value of 0 for most use cases.
    pub norm_level: i32,
}
123
124impl Default for CdcOptions {
125 fn default() -> Self {
126 Self {
127 min_chunk_size: DEFAULT_CDC_MIN_CHUNK_SIZE,
128 max_chunk_size: DEFAULT_CDC_MAX_CHUNK_SIZE,
129 norm_level: DEFAULT_CDC_NORM_LEVEL,
130 }
131 }
132}
133
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns the numeric form of this writer version: `1` for
    /// [`Self::PARQUET_1_0`] and `2` for [`Self::PARQUET_2_0`].
    pub fn as_num(&self) -> i32 {
        match *self {
            Self::PARQUET_1_0 => 1,
            Self::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a writer version from its name.
    ///
    /// Only the all-uppercase and all-lowercase spellings are accepted
    /// (e.g. `PARQUET_1_0` or `parquet_1_0`); any other input yields an
    /// `Invalid writer version: ...` error string.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s == "PARQUET_1_0" || s == "parquet_1_0" {
            return Ok(Self::PARQUET_1_0);
        }
        if s == "PARQUET_2_0" || s == "parquet_2_0" {
            return Ok(Self::PARQUET_2_0);
        }
        Err(format!("Invalid writer version: {s}"))
    }
}
167
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The two variants trade writer memory usage against data locality for readers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
185
/// Reference-counted ([`Arc`]) handle to [`WriterProperties`].
pub type WriterPropertiesPtr = Arc<WriterProperties>;
188
/// Resolved state of [`WriterPropertiesBuilder::set_offset_index_disabled`].
///
/// When a user disables offset indexes but page-level statistics are enabled,
/// the setting is overridden (offset indexes remain enabled). This enum
/// preserves the user's original intent so that a round-trip through
/// `WriterPropertiesBuilder` does not lose it.
///
/// The value is computed once in [`WriterPropertiesBuilder::build`];
/// [`WriterProperties::offset_index_disabled`] reports `true` only for
/// [`Self::Disabled`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OffsetIndexSetting {
    /// Offset indexes are enabled (the default).
    Enabled,
    /// User disabled offset indexes and no page-level statistics override it.
    Disabled,
    /// User disabled offset indexes, but page-level statistics require them,
    /// so they remain enabled.
    DisabledOverridden,
}
205
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    /// Best-effort maximum number of rows in a data page.
    data_page_row_count_limit: usize,
    /// Batch size used when writing column data.
    write_batch_size: usize,
    /// Maximum number of rows in a row group; `None` means unlimited.
    max_row_group_row_count: Option<usize>,
    /// Maximum size of a row group in bytes; `None` means unlimited.
    max_row_group_bytes: Option<usize>,
    /// Where in the file Bloom filters are written.
    bloom_filter_position: BloomFilterPosition,
    /// Writer version recorded for the file.
    writer_version: WriterVersion,
    /// The `created_by` string.
    created_by: String,
    /// Offset index setting as resolved by [`WriterPropertiesBuilder::build`].
    offset_index_setting: OffsetIndexSetting,
    /// Optional `key_value_metadata` pairs.
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings consulted when `column_properties` has no value for a column.
    default_column_properties: ColumnProperties,
    /// Per-column settings, which take precedence over `default_column_properties`.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Optional sorting columns.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Maximum length of truncated min/max values in the column index;
    /// `None` if truncation is disabled.
    column_index_truncate_length: Option<usize>,
    /// Maximum length of truncated min/max values in statistics;
    /// `None` if truncation is disabled.
    statistics_truncate_length: Option<usize>,
    /// Whether type coercion is enabled.
    coerce_types: bool,
    /// EXPERIMENTAL: content-defined chunking options; `None` if CDC is disabled.
    content_defined_chunking: Option<CdcOptions>,
    /// Whether the `path_in_schema` field of the `ColumnMetaData` Thrift struct
    /// should be written.
    write_path_in_schema: bool,
    /// Optional file encryption properties (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
262
263impl Default for WriterProperties {
264 fn default() -> Self {
265 Self::builder().build()
266 }
267}
268
269impl WriterProperties {
270 /// Create a new [`WriterProperties`] with the default settings
271 ///
272 /// See [`WriterProperties::builder`] for customising settings
273 pub fn new() -> Self {
274 Self::default()
275 }
276
277 /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
278 /// properties.
279 pub fn builder() -> WriterPropertiesBuilder {
280 WriterPropertiesBuilder::default()
281 }
282
283 /// Converts this [`WriterProperties`] into a [`WriterPropertiesBuilder`]
284 /// Used for mutating existing property settings
285 pub fn into_builder(self) -> WriterPropertiesBuilder {
286 self.into()
287 }
288
289 /// Returns data page size limit.
290 ///
291 /// Note: this is a best effort limit based on the write batch size
292 ///
293 /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
294 pub fn data_page_size_limit(&self) -> usize {
295 self.default_column_properties
296 .data_page_size_limit()
297 .unwrap_or(DEFAULT_PAGE_SIZE)
298 }
299
300 /// Returns data page size limit for a specific column.
301 ///
302 /// Takes precedence over [`Self::data_page_size_limit`].
303 ///
304 /// Note: this is a best effort limit based on the write batch size.
305 pub fn column_data_page_size_limit(&self, col: &ColumnPath) -> usize {
306 self.column_properties
307 .get(col)
308 .and_then(|c| c.data_page_size_limit())
309 .or_else(|| self.default_column_properties.data_page_size_limit())
310 .unwrap_or(DEFAULT_PAGE_SIZE)
311 }
312
313 /// Returns dictionary page size limit.
314 ///
315 /// Note: this is a best effort limit based on the write batch size
316 ///
317 /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
318 pub fn dictionary_page_size_limit(&self) -> usize {
319 self.default_column_properties
320 .dictionary_page_size_limit()
321 .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
322 }
323
324 /// Returns dictionary page size limit for a specific column.
325 pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
326 self.column_properties
327 .get(col)
328 .and_then(|c| c.dictionary_page_size_limit())
329 .or_else(|| self.default_column_properties.dictionary_page_size_limit())
330 .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
331 }
332
333 /// Returns the maximum page row count
334 ///
335 /// Note: this is a best effort limit based on the write batch size
336 ///
337 /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
338 pub fn data_page_row_count_limit(&self) -> usize {
339 self.data_page_row_count_limit
340 }
341
342 /// Returns configured batch size for writes.
343 ///
344 /// When writing a batch of data, this setting allows to split it internally into
345 /// smaller batches so we can better estimate the size of a page currently being
346 /// written.
347 ///
348 /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
349 pub fn write_batch_size(&self) -> usize {
350 self.write_batch_size
351 }
352
353 /// Returns maximum number of rows in a row group, or `usize::MAX` if unlimited.
354 ///
355 /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
356 #[deprecated(since = "58.0.0", note = "Use `max_row_group_row_count` instead")]
357 pub fn max_row_group_size(&self) -> usize {
358 self.max_row_group_row_count.unwrap_or(usize::MAX)
359 }
360
361 /// Returns maximum number of rows in a row group, or `None` if unlimited.
362 ///
363 /// For more details see [`WriterPropertiesBuilder::set_max_row_group_row_count`]
364 pub fn max_row_group_row_count(&self) -> Option<usize> {
365 self.max_row_group_row_count
366 }
367
368 /// Returns maximum size of a row group in bytes, or `None` if unlimited.
369 ///
370 /// For more details see [`WriterPropertiesBuilder::set_max_row_group_bytes`]
371 pub fn max_row_group_bytes(&self) -> Option<usize> {
372 self.max_row_group_bytes
373 }
374
375 /// Returns bloom filter position.
376 ///
377 /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
378 pub fn bloom_filter_position(&self) -> BloomFilterPosition {
379 self.bloom_filter_position
380 }
381
382 /// Returns configured writer version.
383 ///
384 /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
385 pub fn writer_version(&self) -> WriterVersion {
386 self.writer_version
387 }
388
389 /// Returns `created_by` string.
390 ///
391 /// For more details see [`WriterPropertiesBuilder::set_created_by`]
392 pub fn created_by(&self) -> &str {
393 &self.created_by
394 }
395
396 /// Returns `true` if offset index writing is disabled.
397 ///
398 /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
399 pub fn offset_index_disabled(&self) -> bool {
400 matches!(self.offset_index_setting, OffsetIndexSetting::Disabled)
401 }
402
403 /// Returns `key_value_metadata` KeyValue pairs.
404 ///
405 /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
406 pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
407 self.key_value_metadata.as_ref()
408 }
409
410 /// Returns sorting columns.
411 ///
412 /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
413 pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
414 self.sorting_columns.as_ref()
415 }
416
417 /// Returns the maximum length of truncated min/max values in the column index.
418 ///
419 /// `None` if truncation is disabled, must be greater than 0 otherwise.
420 ///
421 /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
422 pub fn column_index_truncate_length(&self) -> Option<usize> {
423 self.column_index_truncate_length
424 }
425
426 /// Returns the maximum length of truncated min/max values in [`Statistics`].
427 ///
428 /// `None` if truncation is disabled, must be greater than 0 otherwise.
429 ///
430 /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
431 ///
432 /// [`Statistics`]: crate::file::statistics::Statistics
433 pub fn statistics_truncate_length(&self) -> Option<usize> {
434 self.statistics_truncate_length
435 }
436
437 /// Returns `true` if type coercion is enabled.
438 ///
439 /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
440 pub fn coerce_types(&self) -> bool {
441 self.coerce_types
442 }
443
444 /// Returns `true` if the `path_in_schema` field of the `ColumnMetaData` Thrift struct
445 /// should be written.
446 ///
447 /// For more details see [`WriterPropertiesBuilder::set_write_path_in_schema`]
448 pub fn write_path_in_schema(&self) -> bool {
449 self.write_path_in_schema
450 }
451
452 /// EXPERIMENTAL: Returns content-defined chunking options, or `None` if CDC is disabled.
453 ///
454 /// For more details see [`WriterPropertiesBuilder::set_content_defined_chunking`]
455 pub fn content_defined_chunking(&self) -> Option<&CdcOptions> {
456 self.content_defined_chunking.as_ref()
457 }
458
459 /// Returns the compression ratio threshold at or above which a Data Page v2's
460 /// compressed values are discarded in favor of writing the values uncompressed.
461 ///
462 /// For more details see [`WriterPropertiesBuilder::set_data_page_v2_compression_ratio_threshold`]
463 pub fn data_page_v2_compression_ratio_threshold(&self) -> f64 {
464 self.default_column_properties
465 .data_page_v2_compression_ratio_threshold()
466 .unwrap_or(DEFAULT_DATA_PAGE_V2_COMPRESSION_RATIO_THRESHOLD)
467 }
468
469 /// Returns the Data Page v2 compression ratio threshold for a specific column.
470 ///
471 /// Takes precedence over [`Self::data_page_v2_compression_ratio_threshold`].
472 pub fn column_data_page_v2_compression_ratio_threshold(&self, col: &ColumnPath) -> f64 {
473 self.column_properties
474 .get(col)
475 .and_then(|c| c.data_page_v2_compression_ratio_threshold())
476 .or_else(|| {
477 self.default_column_properties
478 .data_page_v2_compression_ratio_threshold()
479 })
480 .unwrap_or(DEFAULT_DATA_PAGE_V2_COMPRESSION_RATIO_THRESHOLD)
481 }
482
483 /// Returns encoding for a data page, when dictionary encoding is enabled.
484 ///
485 /// This is not configurable.
486 #[inline]
487 pub fn dictionary_data_page_encoding(&self) -> Encoding {
488 // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
489 // Dictionary values are encoded using RLE_DICTIONARY encoding.
490 Encoding::RLE_DICTIONARY
491 }
492
493 /// Returns encoding for dictionary page, when dictionary encoding is enabled.
494 ///
495 /// This is not configurable.
496 #[inline]
497 pub fn dictionary_page_encoding(&self) -> Encoding {
498 // PLAIN_DICTIONARY is deprecated in writer version 1.
499 // Dictionary is encoded using plain encoding.
500 Encoding::PLAIN
501 }
502
503 /// Returns encoding for a column, if set.
504 ///
505 /// In case when dictionary is enabled, returns fallback encoding.
506 ///
507 /// If encoding is not set, then column writer will choose the best encoding
508 /// based on the column type.
509 pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
510 self.column_properties
511 .get(col)
512 .and_then(|c| c.encoding())
513 .or_else(|| self.default_column_properties.encoding())
514 }
515
516 /// Returns compression codec for a column.
517 ///
518 /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
519 pub fn compression(&self, col: &ColumnPath) -> Compression {
520 self.column_properties
521 .get(col)
522 .and_then(|c| c.compression())
523 .or_else(|| self.default_column_properties.compression())
524 .unwrap_or(DEFAULT_COMPRESSION)
525 }
526
527 /// Returns `true` if dictionary encoding is enabled for a column.
528 ///
529 /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
530 pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
531 self.column_properties
532 .get(col)
533 .and_then(|c| c.dictionary_enabled())
534 .or_else(|| self.default_column_properties.dictionary_enabled())
535 .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
536 }
537
538 /// Returns which statistics are written for a column.
539 ///
540 /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
541 pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
542 self.column_properties
543 .get(col)
544 .and_then(|c| c.statistics_enabled())
545 .or_else(|| self.default_column_properties.statistics_enabled())
546 .unwrap_or(DEFAULT_STATISTICS_ENABLED)
547 }
548
549 /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
550 ///
551 /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
552 ///
553 /// [`Statistics`]: crate::file::statistics::Statistics
554 pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
555 self.column_properties
556 .get(col)
557 .and_then(|c| c.write_page_header_statistics())
558 .or_else(|| {
559 self.default_column_properties
560 .write_page_header_statistics()
561 })
562 .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
563 }
564
565 /// Returns the [`BloomFilterProperties`] for the given column
566 ///
567 /// Returns `None` if bloom filter is disabled
568 ///
569 /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
570 pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
571 self.column_properties
572 .get(col)
573 .and_then(|c| c.bloom_filter_properties())
574 .or_else(|| self.default_column_properties.bloom_filter_properties())
575 }
576
577 /// Return file encryption properties
578 ///
579 /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
580 #[cfg(feature = "encryption")]
581 pub fn file_encryption_properties(&self) -> Option<&Arc<FileEncryptionProperties>> {
582 self.file_encryption_properties.as_ref()
583 }
584}
585
/// Builder for [`WriterProperties`] Parquet writer configuration.
///
/// See example on [`WriterProperties`]
///
/// The fields mirror those of [`WriterProperties`], except that the offset
/// index choice is kept as the raw `offset_index_disabled` flag and is only
/// resolved in [`Self::build`].
#[derive(Debug, Clone)]
pub struct WriterPropertiesBuilder {
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    // `None` means unlimited.
    max_row_group_row_count: Option<usize>,
    // `None` means unlimited.
    max_row_group_bytes: Option<usize>,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    // Raw user request; `build` may override it when page-level statistics are enabled.
    offset_index_disabled: bool,
    key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
    content_defined_chunking: Option<CdcOptions>,
    write_path_in_schema: bool,
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
611
612impl Default for WriterPropertiesBuilder {
613 /// Returns default state of the builder.
614 fn default() -> Self {
615 Self {
616 data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
617 write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
618 max_row_group_row_count: Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT),
619 max_row_group_bytes: None,
620 bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
621 writer_version: DEFAULT_WRITER_VERSION,
622 created_by: DEFAULT_CREATED_BY.to_string(),
623 offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
624 key_value_metadata: None,
625 default_column_properties: Default::default(),
626 column_properties: HashMap::new(),
627 sorting_columns: None,
628 column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
629 statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
630 coerce_types: DEFAULT_COERCE_TYPES,
631 content_defined_chunking: None,
632 write_path_in_schema: DEFAULT_WRITE_PATH_IN_SCHEMA,
633 #[cfg(feature = "encryption")]
634 file_encryption_properties: None,
635 }
636 }
637}
638
639impl WriterPropertiesBuilder {
640 /// Finalizes the configuration and returns immutable writer properties struct.
641 pub fn build(self) -> WriterProperties {
642 // Pre-compute offset_index_setting
643 let offset_index_setting = if self.offset_index_disabled {
644 let default_page_stats_enabled = self.default_column_properties.statistics_enabled()
645 == Some(EnabledStatistics::Page);
646 let column_page_stats_enabled = self.column_properties.iter().any(|path_props| {
647 path_props.1.statistics_enabled() == Some(EnabledStatistics::Page)
648 });
649 if default_page_stats_enabled || column_page_stats_enabled {
650 OffsetIndexSetting::DisabledOverridden
651 } else {
652 OffsetIndexSetting::Disabled
653 }
654 } else {
655 OffsetIndexSetting::Enabled
656 };
657
658 // Resolve bloom filter NDV for columns where it wasn't explicitly set:
659 // default to max_row_group_row_count so the filter is never undersized.
660 let default_ndv = self
661 .max_row_group_row_count
662 .unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT) as u64;
663 let mut default_column_properties = self.default_column_properties;
664 default_column_properties.resolve_bloom_filter_ndv(default_ndv);
665 let mut column_properties = self.column_properties;
666 for props in column_properties.values_mut() {
667 props.resolve_bloom_filter_ndv(default_ndv);
668 }
669
670 WriterProperties {
671 data_page_row_count_limit: self.data_page_row_count_limit,
672 write_batch_size: self.write_batch_size,
673 max_row_group_row_count: self.max_row_group_row_count,
674 max_row_group_bytes: self.max_row_group_bytes,
675 bloom_filter_position: self.bloom_filter_position,
676 writer_version: self.writer_version,
677 created_by: self.created_by,
678 offset_index_setting,
679 key_value_metadata: self.key_value_metadata,
680 default_column_properties,
681 column_properties,
682 sorting_columns: self.sorting_columns,
683 column_index_truncate_length: self.column_index_truncate_length,
684 statistics_truncate_length: self.statistics_truncate_length,
685 coerce_types: self.coerce_types,
686 content_defined_chunking: self.content_defined_chunking,
687 write_path_in_schema: self.write_path_in_schema,
688 #[cfg(feature = "encryption")]
689 file_encryption_properties: self.file_encryption_properties,
690 }
691 }
692
693 // ----------------------------------------------------------------------
694 // Writer properties related to a file
695
696 /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
697 /// via [`DEFAULT_WRITER_VERSION`])
698 ///
699 /// This value can determine what features some readers will support.
700 ///
701 /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
702 pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
703 self.writer_version = value;
704 self
705 }
706
707 /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
708 /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
709 ///
710 /// The parquet writer will attempt to limit the number of rows in
711 /// each `DataPage` to this value. Reducing this value will result
712 /// in larger parquet files, but may improve the effectiveness of
713 /// page index based predicate pushdown during reading.
714 ///
715 /// Note: this is a best effort limit based on value of
716 /// [`set_write_batch_size`](Self::set_write_batch_size).
717 pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
718 self.data_page_row_count_limit = value;
719 self
720 }
721
722 /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
723 ///
724 /// For performance reasons, data for each column is written in
725 /// batches of this size.
726 ///
727 /// Additional limits such as such as
728 /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
729 /// are checked between batches, and thus the write batch size value acts as an
730 /// upper-bound on the enforcement granularity of other limits.
731 pub fn set_write_batch_size(mut self, value: usize) -> Self {
732 self.write_batch_size = value;
733 self
734 }
735
736 /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
737 /// via [`DEFAULT_MAX_ROW_GROUP_ROW_COUNT`]).
738 ///
739 /// # Panics
740 /// If the value is set to 0.
741 #[deprecated(since = "58.0.0", note = "Use `set_max_row_group_row_count` instead")]
742 pub fn set_max_row_group_size(mut self, value: usize) -> Self {
743 assert!(value > 0, "Cannot have a 0 max row group size");
744 self.max_row_group_row_count = Some(value);
745 self
746 }
747
748 /// Sets maximum number of rows in a row group, or `None` for unlimited.
749 ///
750 /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
751 /// the row group with the smaller limit will be produced.
752 ///
753 /// # Panics
754 /// If the value is `Some(0)`.
755 pub fn set_max_row_group_row_count(mut self, value: Option<usize>) -> Self {
756 assert_ne!(value, Some(0), "Cannot have a 0 max row group row count");
757 self.max_row_group_row_count = value;
758 self
759 }
760
761 /// Sets maximum size of a row group in bytes, or `None` for unlimited.
762 ///
763 /// Row groups are flushed when their estimated encoded size exceeds this threshold.
764 /// This is similar to the official Java implementation for `parquet.block.size`'s behavior.
765 ///
766 /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
767 /// the row group with the smaller limit will be produced.
768 ///
769 /// # Panics
770 /// If the value is `Some(0)`.
771 pub fn set_max_row_group_bytes(mut self, value: Option<usize>) -> Self {
772 assert_ne!(value, Some(0), "Cannot have a 0 max row group bytes");
773 self.max_row_group_bytes = value;
774 self
775 }
776
777 /// Sets where in the final file Bloom Filters are written (defaults to [`AfterRowGroup`]
778 /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
779 ///
780 /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
781 pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
782 self.bloom_filter_position = value;
783 self
784 }
785
786 /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
787 /// [`DEFAULT_CREATED_BY`]).
788 ///
789 /// This is a string that will be written into the file metadata
790 pub fn set_created_by(mut self, value: String) -> Self {
791 self.created_by = value;
792 self
793 }
794
795 /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
796 /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
797 ///
798 /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
799 ///
800 /// Note: As the offset indexes are useful for accessing data by row number,
801 /// they are always written by default, regardless of whether other statistics
802 /// are enabled. Disabling this metadata may result in a degradation in read
803 /// performance, so use this option with care.
804 ///
805 /// [`Page`]: EnabledStatistics::Page
806 pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
807 self.offset_index_disabled = value;
808 self
809 }
810
811 /// Sets "key_value_metadata" property (defaults to `None`).
812 pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
813 self.key_value_metadata = value;
814 self
815 }
816
817 /// Sets sorting order of rows in the row group if any (defaults to `None`).
818 pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
819 self.sorting_columns = value;
820 self
821 }
822
823 /// Sets the max length of min/max value fields when writing the column
824 /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
825 ///
826 /// This can be used to prevent columns with very long values (hundreds of
827 /// bytes long) from causing the parquet metadata to become huge.
828 ///
829 /// # Notes
830 ///
831 /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
832 /// set to [`EnabledStatistics::Page`].
833 ///
834 /// * If `Some`, must be greater than 0, otherwise will panic
835 /// * If `None`, there's no effective limit.
836 ///
837 /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
838 pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
839 if let Some(value) = max_length {
840 assert!(
841 value > 0,
842 "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`."
843 );
844 }
845
846 self.column_index_truncate_length = max_length;
847 self
848 }
849
850 /// Sets the max length of min/max value fields in row group and data page header
851 /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
852 ///
853 /// # Notes
854 /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
855 /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
856 /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
857 /// [`EnabledStatistics::Page`].
858 ///
859 /// * If `Some`, must be greater than 0, otherwise will panic
860 /// * If `None`, there's no effective limit.
861 ///
862 /// # See also
863 /// Truncation of Page Index statistics is controlled separately via
864 /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
865 ///
866 /// [`Statistics`]: crate::file::statistics::Statistics
867 pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
868 if let Some(value) = max_length {
869 assert!(
870 value > 0,
871 "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`."
872 );
873 }
874
875 self.statistics_truncate_length = max_length;
876 self
877 }
878
879 /// Should the writer coerce types to parquet native types (defaults to `false` via
880 /// [`DEFAULT_COERCE_TYPES`]).
881 ///
882 /// Leaving this option the default `false` will ensure the exact same data
883 /// written to parquet using this library will be read.
884 ///
885 /// Setting this option to `true` will result in parquet files that can be
886 /// read by more readers, but potentially lose information in the process.
887 ///
888 /// * Types such as [`DataType::Date64`], which have no direct corresponding
889 /// Parquet type, may be stored with lower precision.
890 ///
891 /// * The internal field names of `List` and `Map` types will be renamed if
892 /// necessary to match what is required by the newest Parquet specification.
893 ///
894 /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
895 ///
896 /// [`DataType::Date64`]: arrow_schema::DataType::Date64
897 /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
898 pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
899 self.coerce_types = coerce_types;
900 self
901 }
902
903 /// EXPERIMENTAL: Should the writer emit the `path_in_schema` element of the
904 /// `ColumnMetaData` Thrift struct. Defaults to `true` via [`DEFAULT_WRITE_PATH_IN_SCHEMA`].
905 ///
906 /// Because `path_in_schema` is a field on the `ColumnMetaData`, it is repeated
907 /// `num_columns * num_rowgroups` times. Compounding this is any level of nesting or
908 /// repetition in the schema. For instance, a top-level list column named `foo` will have
909 /// a `path_in_schema` of `["foo", "list", "element"]`. A list-of-struct is even worse,
910 /// because the necessary list wrapping is repeated for each element of the struct. A
911 /// file with a deeply nested schema and many row groups can have a large percentage of the
912 /// footer taken up by this field. For example, a file of 38 row groups with a schema containing
913 /// several lists of structs containing lists had 36% of the footer taken up by `path_in_schema`.
914 /// Removing this redundant information can greatly speed up footer parsing, which is particularly
915 /// important in scenarios where one does not wish to read the entire file (e.g. point
916 /// lookups).
917 ///
918 /// <div class="warning">
919 ///
920 /// **WARNING:**
921 /// Setting this to `false` will break compatibility with Parquet readers that
922 /// still expect this field to be present. Virtually all Parquet readers (parquet-java,
923 /// Spark, arrow-cpp, pyarrow, pandas to name a few), with the exception
924 /// of the one in this crate, expect this field to be present, and will terminate execution
925 /// if it is not. This will continue to be the case unless/until the Parquet format
926 /// specification is explicitly changed to allow this field to be missing. As a consquence,
927 /// users should only set this to `false` if they have verified that any reader(s) they plan
928 /// to use can tolerate the absence of this field.
929 ///
930 /// For more context, see [GH-563].
931 ///
932 /// </div>
933 ///
934 /// [GH-563]: https://github.com/apache/parquet-format/issues/563
935 pub fn set_write_path_in_schema(mut self, write_path_in_schema: bool) -> Self {
936 self.write_path_in_schema = write_path_in_schema;
937 self
938 }
939
940 /// EXPERIMENTAL: Sets content-defined chunking options, or disables CDC with `None`.
941 ///
942 /// When enabled, data page boundaries are determined by a rolling hash of the
943 /// column values, so unchanged data produces identical byte sequences across
944 /// file versions. This enables efficient deduplication on content-addressable
945 /// storage systems.
946 ///
947 /// Only supported through the Arrow writer interface ([`ArrowWriter`]).
948 ///
949 /// # Panics
950 ///
951 /// Panics if `min_chunk_size == 0` or `max_chunk_size <= min_chunk_size`.
952 ///
953 /// [`ArrowWriter`]: crate::arrow::arrow_writer::ArrowWriter
954 pub fn set_content_defined_chunking(mut self, options: Option<CdcOptions>) -> Self {
955 if let Some(ref options) = options {
956 assert!(
957 options.min_chunk_size > 0,
958 "min_chunk_size must be positive"
959 );
960 assert!(
961 options.max_chunk_size > options.min_chunk_size,
962 "max_chunk_size ({}) must be greater than min_chunk_size ({})",
963 options.max_chunk_size,
964 options.min_chunk_size
965 );
966 }
967 self.content_defined_chunking = options;
968 self
969 }
970
971 /// Sets the default compression ratio threshold at or above which a Data Page
972 /// v2's compressed values are discarded in favor of writing the values
973 /// uncompressed, for all columns (defaults to `1.0` via
974 /// [`DEFAULT_DATA_PAGE_V2_COMPRESSION_RATIO_THRESHOLD`]).
975 ///
976 /// When writing a Data Page v2 with a configured compression codec, the writer
977 /// first compresses the values and then compares the compressed size to the
978 /// uncompressed size. If `compressed_size >= uncompressed_size * threshold`, the
979 /// compressed buffer is discarded and the values are written uncompressed for
980 /// that page (the page's `is_compressed` flag is set to `false`).
981 ///
982 /// The default of `1.0` preserves the historical behavior of only keeping
983 /// compression when it strictly reduces the size. Setting a value below `1.0`
984 /// requires a minimum amount of size reduction to keep the compressed page —
985 /// for example `0.9` requires at least a 10% reduction. Setting a value above
986 /// `1.0` keeps the compressed buffer even if it's somewhat larger than the
987 /// uncompressed values.
988 ///
989 /// This setting only affects Data Page v2; Data Page v1 always stores the
990 /// compressor's output regardless of the resulting size.
991 ///
992 /// # Panics
993 /// If `value` is not finite or is not strictly positive.
994 pub fn set_data_page_v2_compression_ratio_threshold(mut self, value: f64) -> Self {
995 self.default_column_properties
996 .set_data_page_v2_compression_ratio_threshold(value);
997 self
998 }
999
/// Sets the [`FileEncryptionProperties`] (the default is `None`).
#[cfg(feature = "encryption")]
pub fn with_file_encryption_properties(
    self,
    file_encryption_properties: Arc<FileEncryptionProperties>,
) -> Self {
    Self {
        file_encryption_properties: Some(file_encryption_properties),
        ..self
    }
}
1009
1010 // ----------------------------------------------------------------------
1011 // Setters for any column (global)
1012
1013 /// Sets default encoding for all columns.
1014 ///
1015 /// If dictionary is not enabled, this is treated as a primary encoding for all
1016 /// columns. In case when dictionary is enabled for any column, this value is
1017 /// considered to be a fallback encoding for that column.
1018 ///
1019 /// # Panics
1020 ///
1021 /// if dictionary encoding is specified, regardless of dictionary
1022 /// encoding flag being set.
1023 pub fn set_encoding(mut self, value: Encoding) -> Self {
1024 self.default_column_properties.set_encoding(value);
1025 self
1026 }
1027
1028 /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
1029 /// [`DEFAULT_COMPRESSION`]).
1030 ///
1031 /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
1032 pub fn set_compression(mut self, value: Compression) -> Self {
1033 self.default_column_properties.set_compression(value);
1034 self
1035 }
1036
1037 /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
1038 /// via [`DEFAULT_DICTIONARY_ENABLED`]).
1039 ///
1040 /// Use this method to set dictionary encoding, instead of explicitly specifying
1041 /// encoding in `set_encoding` method.
1042 pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
1043 self.default_column_properties.set_dictionary_enabled(value);
1044 self
1045 }
1046
1047 /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
1048 /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
1049 ///
1050 /// The parquet writer will attempt to limit the size of each
1051 /// `DataPage` used to store dictionaries to this many
1052 /// bytes. Reducing this value will result in larger parquet
1053 /// files, but may improve the effectiveness of page index based
1054 /// predicate pushdown during reading.
1055 ///
1056 /// Note: this is a best effort limit based on value of
1057 /// [`set_write_batch_size`](Self::set_write_batch_size).
1058 pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
1059 self.default_column_properties
1060 .set_dictionary_page_size_limit(value);
1061 self
1062 }
1063
1064 /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
1065 /// via [`DEFAULT_PAGE_SIZE`]).
1066 ///
1067 /// The parquet writer will attempt to limit the sizes of each
1068 /// `DataPage` to this many bytes. Reducing this value will result
1069 /// in larger parquet files, but may improve the effectiveness of
1070 /// page index based predicate pushdown during reading.
1071 ///
1072 /// Note: this is a best effort limit based on value of
1073 /// [`set_write_batch_size`](Self::set_write_batch_size).
1074 pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
1075 self.default_column_properties
1076 .set_data_page_size_limit(value);
1077 self
1078 }
1079
1080 /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
1081 /// [`DEFAULT_STATISTICS_ENABLED`]).
1082 ///
1083 /// [`Page`]: EnabledStatistics::Page
1084 pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
1085 self.default_column_properties.set_statistics_enabled(value);
1086 self
1087 }
1088
1089 /// enable/disable writing [`Statistics`] in the page header
1090 /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
1091 ///
1092 /// Only applicable if [`Page`] level statistics are gathered.
1093 ///
1094 /// Setting this value to `true` can greatly increase the size of the resulting Parquet
1095 /// file while yielding very little added benefit. Most modern Parquet implementations
1096 /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
1097 /// those in the page header.
1098 ///
1099 /// # Note
1100 ///
1101 /// Prior to version 56.0.0, the `parquet` crate always wrote these
1102 /// statistics (the equivalent of setting this option to `true`). This was
1103 /// changed in 56.0.0 to follow the recommendation in the Parquet
1104 /// specification. See [issue #7580] for more details.
1105 ///
1106 /// [`Statistics`]: crate::file::statistics::Statistics
1107 /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
1108 /// [`Page`]: EnabledStatistics::Page
1109 /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
1110 pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
1111 self.default_column_properties
1112 .set_write_page_header_statistics(value);
1113 self
1114 }
1115
1116 /// Sets if bloom filter should be written for all columns (defaults to `false`).
1117 ///
1118 /// # Notes
1119 ///
1120 /// * If the bloom filter is enabled previously then it is a no-op.
1121 ///
1122 /// * If the bloom filter is not enabled, default values for ndv and fpp
1123 /// value are used used. See [`set_bloom_filter_max_ndv`] and
1124 /// [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
1125 ///
1126 /// [`set_bloom_filter_max_ndv`]: Self::set_bloom_filter_max_ndv
1127 /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
1128 pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
1129 self.default_column_properties
1130 .set_bloom_filter_enabled(value);
1131 self
1132 }
1133
1134 /// Sets the default target bloom filter false positive probability (fpp)
1135 /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
1136 ///
1137 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
1138 /// been called.
1139 ///
1140 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1141 pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
1142 self.default_column_properties.set_bloom_filter_fpp(value);
1143 self
1144 }
1145
1146 /// Sets default maximum expected number of distinct values (ndv) for bloom filter
1147 /// for all columns (defaults to [`DEFAULT_BLOOM_FILTER_NDV`]).
1148 ///
1149 /// The bloom filter is initially sized for this many distinct values at the
1150 /// configured FPP, then folded down after all values are inserted to achieve
1151 /// optimal size. A good heuristic is to set this to the expected number of rows
1152 /// in the row group.
1153 ///
1154 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
1155 /// been called.
1156 ///
1157 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1158 pub fn set_bloom_filter_max_ndv(mut self, value: u64) -> Self {
1159 self.default_column_properties.set_bloom_filter_ndv(value);
1160 self
1161 }
1162
1163 /// Deprecated alias for [`Self::set_bloom_filter_max_ndv`].
1164 #[deprecated(since = "59.0.0", note = "Use `set_bloom_filter_max_ndv` instead")]
1165 pub fn set_bloom_filter_ndv(self, value: u64) -> Self {
1166 self.set_bloom_filter_max_ndv(value)
1167 }
1168
1169 // ----------------------------------------------------------------------
1170 // Setters for a specific column
1171
1172 /// Helper method to get existing or new mutable reference of column properties.
1173 #[inline]
1174 fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
1175 self.column_properties.entry(col).or_default()
1176 }
1177
1178 /// Sets encoding for a specific column.
1179 ///
1180 /// Takes precedence over [`Self::set_encoding`].
1181 ///
1182 /// If dictionary is not enabled, this is treated as a primary encoding for this
1183 /// column. In case when dictionary is enabled for this column, either through
1184 /// global defaults or explicitly, this value is considered to be a fallback
1185 /// encoding for this column.
1186 ///
1187 /// # Panics
1188 /// If user tries to set dictionary encoding here, regardless of dictionary
1189 /// encoding flag being set.
1190 pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
1191 self.get_mut_props(col).set_encoding(value);
1192 self
1193 }
1194
1195 /// Sets compression codec for a specific column.
1196 ///
1197 /// Takes precedence over [`Self::set_compression`].
1198 pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
1199 self.get_mut_props(col).set_compression(value);
1200 self
1201 }
1202
1203 /// Sets flag to enable/disable dictionary encoding for a specific column.
1204 ///
1205 /// Takes precedence over [`Self::set_dictionary_enabled`].
1206 pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
1207 self.get_mut_props(col).set_dictionary_enabled(value);
1208 self
1209 }
1210
1211 /// Sets dictionary page size limit for a specific column.
1212 ///
1213 /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
1214 pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
1215 self.get_mut_props(col)
1216 .set_dictionary_page_size_limit(value);
1217 self
1218 }
1219
1220 /// Sets data page size limit for a specific column.
1221 ///
1222 /// Takes precedence over [`Self::set_data_page_size_limit`].
1223 pub fn set_column_data_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
1224 self.get_mut_props(col).set_data_page_size_limit(value);
1225 self
1226 }
1227
1228 /// Sets [`EnabledStatistics`] level for a specific column.
1229 ///
1230 /// Takes precedence over [`Self::set_statistics_enabled`].
1231 pub fn set_column_statistics_enabled(
1232 mut self,
1233 col: ColumnPath,
1234 value: EnabledStatistics,
1235 ) -> Self {
1236 self.get_mut_props(col).set_statistics_enabled(value);
1237 self
1238 }
1239
1240 /// Sets whether to write [`Statistics`] in the page header for a specific column.
1241 ///
1242 /// Takes precedence over [`Self::set_write_page_header_statistics`].
1243 ///
1244 /// [`Statistics`]: crate::file::statistics::Statistics
1245 pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
1246 self.get_mut_props(col)
1247 .set_write_page_header_statistics(value);
1248 self
1249 }
1250
1251 /// Sets whether a bloom filter should be written for a specific column.
1252 ///
1253 /// Takes precedence over [`Self::set_bloom_filter_enabled`].
1254 pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
1255 self.get_mut_props(col).set_bloom_filter_enabled(value);
1256 self
1257 }
1258
1259 /// Sets the false positive probability for bloom filter for a specific column.
1260 ///
1261 /// Takes precedence over [`Self::set_bloom_filter_fpp`].
1262 pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
1263 self.get_mut_props(col).set_bloom_filter_fpp(value);
1264 self
1265 }
1266
1267 /// Sets the maximum expected number of distinct values for bloom filter for
1268 /// a specific column.
1269 ///
1270 /// Takes precedence over [`Self::set_bloom_filter_max_ndv`].
1271 pub fn set_column_bloom_filter_max_ndv(mut self, col: ColumnPath, value: u64) -> Self {
1272 self.get_mut_props(col).set_bloom_filter_ndv(value);
1273 self
1274 }
1275
1276 /// Sets the Data Page v2 compression ratio threshold for a specific column.
1277 ///
1278 /// Takes precedence over [`Self::set_data_page_v2_compression_ratio_threshold`].
1279 ///
1280 /// # Panics
1281 /// If `value` is not finite or is not strictly positive.
1282 pub fn set_column_data_page_v2_compression_ratio_threshold(
1283 mut self,
1284 col: ColumnPath,
1285 value: f64,
1286 ) -> Self {
1287 self.get_mut_props(col)
1288 .set_data_page_v2_compression_ratio_threshold(value);
1289 self
1290 }
1291
1292 /// Deprecated alias for [`Self::set_column_bloom_filter_max_ndv`].
1293 #[deprecated(
1294 since = "59.0.0",
1295 note = "Use `set_column_bloom_filter_max_ndv` instead"
1296 )]
1297 pub fn set_column_bloom_filter_ndv(self, col: ColumnPath, value: u64) -> Self {
1298 self.set_column_bloom_filter_max_ndv(col, value)
1299 }
1300
1301 /// Sets the [`BloomFilterProperties`] for all columns, implicitly enabling
1302 /// the bloom filter.
1303 ///
1304 /// Both `fpp` and `ndv` from `value` are treated as explicit and will not
1305 /// be overridden by the build-time row-group-size NDV fallback. For
1306 /// dynamic NDV sizing (resolved to `max_row_group_row_count` at build
1307 /// time), use [`Self::set_bloom_filter_enabled`] or
1308 /// [`Self::set_bloom_filter_fpp`] instead.
1309 pub fn set_bloom_filter_properties(mut self, value: BloomFilterProperties) -> Self {
1310 self.default_column_properties
1311 .set_bloom_filter_properties(value);
1312 self
1313 }
1314
1315 /// Sets the [`BloomFilterProperties`] for a specific column.
1316 ///
1317 /// Takes precedence over [`Self::set_bloom_filter_properties`].
1318 pub fn set_column_bloom_filter_properties(
1319 mut self,
1320 col: ColumnPath,
1321 value: BloomFilterProperties,
1322 ) -> Self {
1323 self.get_mut_props(col).set_bloom_filter_properties(value);
1324 self
1325 }
1326}
1327
1328impl From<WriterProperties> for WriterPropertiesBuilder {
1329 fn from(props: WriterProperties) -> Self {
1330 WriterPropertiesBuilder {
1331 data_page_row_count_limit: props.data_page_row_count_limit,
1332 write_batch_size: props.write_batch_size,
1333 max_row_group_row_count: props.max_row_group_row_count,
1334 max_row_group_bytes: props.max_row_group_bytes,
1335 bloom_filter_position: props.bloom_filter_position,
1336 writer_version: props.writer_version,
1337 created_by: props.created_by,
1338 offset_index_disabled: !matches!(
1339 props.offset_index_setting,
1340 OffsetIndexSetting::Enabled
1341 ),
1342 key_value_metadata: props.key_value_metadata,
1343 default_column_properties: props.default_column_properties,
1344 column_properties: props.column_properties,
1345 sorting_columns: props.sorting_columns,
1346 column_index_truncate_length: props.column_index_truncate_length,
1347 statistics_truncate_length: props.statistics_truncate_length,
1348 coerce_types: props.coerce_types,
1349 content_defined_chunking: props.content_defined_chunking,
1350 write_path_in_schema: props.write_path_in_schema,
1351 #[cfg(feature = "encryption")]
1352 file_encryption_properties: props.file_encryption_properties,
1353 }
1354 }
1355}
1356
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. In addition, this will enable the writing
    /// of the column index (the offset index is always written regardless of
    /// this setting). See [`ParquetColumnIndex`] for
    /// more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}

/// Parses a statistics level from its all-uppercase or all-lowercase name
/// (`NONE`/`none`, `CHUNK`/`chunk`, `PAGE`/`page`); any other input is an error.
impl FromStr for EnabledStatistics {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let level = match s {
            "NONE" | "none" => Self::None,
            "CHUNK" | "chunk" => Self::Chunk,
            "PAGE" | "page" => Self::Page,
            _ => return Err(format!("Invalid statistics arg: {s}")),
        };
        Ok(level)
    }
}
1400
impl Default for EnabledStatistics {
    /// Returns [`DEFAULT_STATISTICS_ENABLED`].
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
1406
/// Controls the bloom filter to be computed by the writer.
///
/// The bloom filter is initially sized for `ndv` distinct values at the given `fpp`, then
/// automatically folded down after all values are inserted to achieve optimal size while
/// maintaining the target `fpp`. See [`Sbbf::fold_to_target_fpp`] for details on the
/// folding algorithm.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #     file::properties::{BloomFilterProperties, WriterProperties},
/// #     schema::types::ColumnPath,
/// # };
/// // Build a BloomFilterProperties via the builder, then apply it to one column.
/// let bf = BloomFilterProperties::builder()
///     .with_fpp(0.01)
///     .with_max_ndv(10_000)
///     .build();
///
/// let props = WriterProperties::builder()
///     .set_column_bloom_filter_properties(ColumnPath::from("user_id"), bf.clone())
///     .build();
///
/// assert_eq!(
///     props.bloom_filter_properties(&ColumnPath::from("user_id")),
///     Some(&bf)
/// );
/// ```
///
/// [`Sbbf::fold_to_target_fpp`]: crate::bloom_filter::Sbbf::fold_to_target_fpp
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    // Target false positive probability; exposed through `fpp()`.
    fpp: f64,
    // Maximum expected number of distinct values; exposed through `ndv()`.
    ndv: u64,
}
1443
1444impl Default for BloomFilterProperties {
1445 fn default() -> Self {
1446 BloomFilterProperties {
1447 fpp: DEFAULT_BLOOM_FILTER_FPP,
1448 ndv: DEFAULT_BLOOM_FILTER_NDV,
1449 }
1450 }
1451}
1452
1453impl BloomFilterProperties {
1454 /// Returns a new [`BloomFilterPropertiesBuilder`] for constructing
1455 /// [`BloomFilterProperties`] with custom values.
1456 pub fn builder() -> BloomFilterPropertiesBuilder {
1457 BloomFilterPropertiesBuilder::new()
1458 }
1459
1460 /// False positive probability. This should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
1461 ///
1462 /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
1463 ///
1464 /// The bloom filter data structure is a trade of between disk and memory space versus fpp, the
1465 /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
1466 /// e.g. 0.1, 0.05, or 0.001 is recommended.
1467 ///
1468 /// This value also serves as the target FPP for bloom filter folding: after all values
1469 /// are inserted, the filter is folded down to the smallest size that still meets this FPP.
1470 pub fn fpp(&self) -> f64 {
1471 self.fpp
1472 }
1473
1474 /// Maximum expected number of distinct values. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
1475 ///
1476 /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_max_ndv`].
1477 ///
1478 /// When not explicitly set via the builder, this defaults to
1479 /// [`max_row_group_row_count`](WriterProperties::max_row_group_row_count) (resolved at
1480 /// build time). The bloom filter is initially sized for this many distinct values at the
1481 /// given `fpp`, then folded down after insertion to achieve optimal size. A good heuristic
1482 /// is to set this to the expected number of rows in the row group. If fewer distinct values
1483 /// are actually written, the filter will be automatically compacted via folding.
1484 ///
1485 /// Thus the only negative side of overestimating this value is that the bloom filter
1486 /// will use more memory during writing than necessary, but it will not affect the final
1487 /// bloom filter size on disk.
1488 ///
1489 /// If you wish to reduce memory usage during writing and are able to make a reasonable estimate
1490 /// of the number of distinct values in a row group, it is recommended to set this value explicitly
1491 /// rather than relying on the default dynamic sizing based on `max_row_group_row_count`.
1492 /// If you do set this value explicitly it is probably best to set it for each column
1493 /// individually via [`WriterPropertiesBuilder::set_column_bloom_filter_max_ndv`] rather than globally,
1494 /// since different columns may have different numbers of distinct values.
1495 pub fn ndv(&self) -> u64 {
1496 self.ndv
1497 }
1498}
1499
/// Builder for [`BloomFilterProperties`].
///
/// Use [`BloomFilterProperties::builder`] or [`BloomFilterPropertiesBuilder::new`]
/// as the entry point.
#[derive(Debug, Clone, Default)]
pub struct BloomFilterPropertiesBuilder {
    // Requested false positive probability; `None` until `with_fpp` is called.
    fpp: Option<f64>,
    // Requested maximum number of distinct values; `None` until `with_max_ndv` is called.
    ndv: Option<u64>,
}
1509
1510impl BloomFilterPropertiesBuilder {
1511 /// Returns a new builder with no fields set.
1512 ///
1513 /// Equivalent to [`BloomFilterProperties::builder`].
1514 pub fn new() -> Self {
1515 Self::default()
1516 }
1517
1518 /// Sets the target false positive probability.
1519 ///
1520 /// The value must be in `(0.0, 1.0)` exclusively; this is validated at
1521 /// build time by [`Self::build`] / [`Self::try_build`]. When unset, the
1522 /// default is `0.05` (5%, see [`DEFAULT_BLOOM_FILTER_FPP`]).
1523 pub fn with_fpp(mut self, fpp: f64) -> Self {
1524 self.fpp = Some(fpp);
1525 self
1526 }
1527
1528 /// Sets the maximum expected number of distinct values used to size the
1529 /// bloom filter before folding.
1530 ///
1531 /// When unset, the default is `1_048_576` (see [`DEFAULT_BLOOM_FILTER_NDV`]),
1532 /// which at the default fpp of 5% reserves roughly 1 MiB per column for the
1533 /// filter bitset, derived as follows:
1534 ///
1535 /// ```text
1536 /// ndv = 1,048,576, fpp = 0.05
1537 /// 0.05^(1/8) ≈ 0.6877
1538 /// 1 - 0.6877 ≈ 0.3123
1539 /// ln(0.3123) ≈ -1.164
1540 /// num_bits = -8 * 1,048,576 / -1.164 ≈ 7,206,000 bits
1541 /// ≈ 900,750 bytes (~900 KB)
1542 /// next_power_of_two(900 KB) = 1 MiB (= 1,048,576 bytes)
1543 /// ```
1544 pub fn with_max_ndv(mut self, ndv: u64) -> Self {
1545 self.ndv = Some(ndv);
1546 self
1547 }
1548
1549 /// Builds [`BloomFilterProperties`].
1550 ///
1551 /// Panics if the configured `fpp` is not in `(0.0, 1.0)` exclusive.
1552 /// Use [`Self::try_build`] for a non-panicking alternative.
1553 pub fn build(self) -> BloomFilterProperties {
1554 self.try_build().unwrap_or_else(|e| panic!("{e}"))
1555 }
1556
1557 /// Builds [`BloomFilterProperties`], returning an error instead of
1558 /// panicking when the configured `fpp` is not in `(0.0, 1.0)` exclusive.
1559 pub fn try_build(self) -> Result<BloomFilterProperties> {
1560 let fpp = self.fpp.unwrap_or(DEFAULT_BLOOM_FILTER_FPP);
1561 validate_bloom_filter_fpp(fpp).map_err(ParquetError::General)?;
1562 let ndv = self.ndv.unwrap_or(DEFAULT_BLOOM_FILTER_NDV);
1563 Ok(BloomFilterProperties { fpp, ndv })
1564 }
1565}
1566
/// Checks that a bloom filter false positive probability lies strictly between 0 and 1.
///
/// Keeping the range check in one place lets
/// [`ColumnProperties::set_bloom_filter_fpp`] (which panics on failure) and
/// [`BloomFilterPropertiesBuilder::try_build`] (which returns an error) report
/// the same message.
///
/// Returns `Ok(())` for an acceptable value, otherwise `Err` carrying a
/// human-readable description. `NaN` is rejected because it fails both comparisons.
fn validate_bloom_filter_fpp(fpp: f64) -> std::result::Result<(), String> {
    let strictly_inside_unit_interval = fpp > 0.0 && fpp < 1.0;
    if strictly_inside_unit_interval {
        Ok(())
    } else {
        Err(format!(
            "fpp must be between 0.0 and 1.0 exclusive, got {fpp}"
        ))
    }
}
1578
1579/// Container for column properties that can be changed as part of writer.
1580///
1581/// If a field is `None`, it means that no specific value has been set for this column,
1582/// so some subsequent or default value must be used.
1583#[derive(Debug, Clone, Default, PartialEq)]
1584struct ColumnProperties {
1585 encoding: Option<Encoding>,
1586 codec: Option<Compression>,
1587 data_page_size_limit: Option<usize>,
1588 dictionary_page_size_limit: Option<usize>,
1589 dictionary_enabled: Option<bool>,
1590 statistics_enabled: Option<EnabledStatistics>,
1591 write_page_header_statistics: Option<bool>,
1592 /// bloom filter related properties
1593 bloom_filter_properties: Option<BloomFilterProperties>,
1594 /// Whether the bloom filter NDV was explicitly set by the user
1595 bloom_filter_ndv_is_set: bool,
1596 data_page_v2_compression_ratio_threshold: Option<f64>,
1597}
1598
1599impl ColumnProperties {
1600 /// Sets encoding for this column.
1601 ///
1602 /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1603 /// In case when dictionary is enabled for a column, this value is considered to
1604 /// be a fallback encoding.
1605 ///
1606 /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1607 /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1608 /// for a column.
1609 fn set_encoding(&mut self, value: Encoding) {
1610 if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1611 panic!("Dictionary encoding can not be used as fallback encoding");
1612 }
1613 self.encoding = Some(value);
1614 }
1615
1616 /// Sets compression codec for this column.
1617 fn set_compression(&mut self, value: Compression) {
1618 self.codec = Some(value);
1619 }
1620
1621 /// Sets data page size limit for this column.
1622 fn set_data_page_size_limit(&mut self, value: usize) {
1623 self.data_page_size_limit = Some(value);
1624 }
1625
1626 /// Sets whether dictionary encoding is enabled for this column.
1627 fn set_dictionary_enabled(&mut self, enabled: bool) {
1628 self.dictionary_enabled = Some(enabled);
1629 }
1630
1631 /// Sets dictionary page size limit for this column.
1632 fn set_dictionary_page_size_limit(&mut self, value: usize) {
1633 self.dictionary_page_size_limit = Some(value);
1634 }
1635
1636 /// Sets the statistics level for this column.
1637 fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1638 self.statistics_enabled = Some(enabled);
1639 }
1640
1641 /// Sets whether to write statistics in the page header for this column.
1642 fn set_write_page_header_statistics(&mut self, enabled: bool) {
1643 self.write_page_header_statistics = Some(enabled);
1644 }
1645
1646 /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1647 /// otherwise it is a no-op.
1648 /// If `value` is `false`, resets bloom filter properties to `None`.
1649 fn set_bloom_filter_enabled(&mut self, value: bool) {
1650 if value && self.bloom_filter_properties.is_none() {
1651 self.bloom_filter_properties = Some(Default::default())
1652 } else if !value {
1653 self.bloom_filter_properties = None
1654 }
1655 }
1656
1657 /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1658 /// bloom filter if not previously enabled.
1659 ///
1660 /// # Panics
1661 ///
1662 /// Panics if the `value` is not between 0 and 1 exclusive
1663 fn set_bloom_filter_fpp(&mut self, value: f64) {
1664 if let Err(msg) = validate_bloom_filter_fpp(value) {
1665 panic!("{msg}");
1666 }
1667 self.bloom_filter_properties
1668 .get_or_insert_with(Default::default)
1669 .fpp = value;
1670 }
1671
1672 /// Sets the maximum expected number of distinct (unique) values for bloom filter for this
1673 /// column, and implicitly enables bloom filter if not previously enabled.
1674 fn set_bloom_filter_ndv(&mut self, value: u64) {
1675 self.bloom_filter_properties
1676 .get_or_insert_with(Default::default)
1677 .ndv = value;
1678 self.bloom_filter_ndv_is_set = true;
1679 }
1680
1681 /// Sets the bloom filter properties for this column from a fully-built
1682 /// [`BloomFilterProperties`], implicitly enabling the bloom filter.
1683 ///
1684 /// Both `fpp` and `ndv` from `value` are treated as explicit, so the
1685 /// build-time row-group-size NDV fallback in
1686 /// [`WriterPropertiesBuilder::build`] will not override them.
1687 fn set_bloom_filter_properties(&mut self, value: BloomFilterProperties) {
1688 self.bloom_filter_properties = Some(value);
1689 self.bloom_filter_ndv_is_set = true;
1690 }
1691
1692 /// Sets the Data Page v2 compression ratio threshold for this column.
1693 ///
1694 /// # Panics
1695 /// If `value` is not finite or is not strictly positive.
1696 fn set_data_page_v2_compression_ratio_threshold(&mut self, value: f64) {
1697 assert!(
1698 value.is_finite() && value > 0.0,
1699 "data_page_v2_compression_ratio_threshold must be a positive finite number, got {value}"
1700 );
1701 self.data_page_v2_compression_ratio_threshold = Some(value);
1702 }
1703
1704 /// Returns optional encoding for this column.
1705 fn encoding(&self) -> Option<Encoding> {
1706 self.encoding
1707 }
1708
1709 /// Returns optional compression codec for this column.
1710 fn compression(&self) -> Option<Compression> {
1711 self.codec
1712 }
1713
1714 /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1715 /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1716 /// been provided.
1717 fn dictionary_enabled(&self) -> Option<bool> {
1718 self.dictionary_enabled
1719 }
1720
1721 /// Returns optional dictionary page size limit for this column.
1722 fn dictionary_page_size_limit(&self) -> Option<usize> {
1723 self.dictionary_page_size_limit
1724 }
1725
1726 /// Returns optional data page size limit for this column.
1727 fn data_page_size_limit(&self) -> Option<usize> {
1728 self.data_page_size_limit
1729 }
1730
1731 /// Returns optional statistics level requested for this column. If result is `None`,
1732 /// then no setting has been provided.
1733 fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1734 self.statistics_enabled
1735 }
1736
1737 /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
1738 /// column.
1739 ///
1740 /// [`Statistics`]: crate::file::statistics::Statistics
1741 fn write_page_header_statistics(&self) -> Option<bool> {
1742 self.write_page_header_statistics
1743 }
1744
1745 /// Returns the bloom filter properties, or `None` if not enabled
1746 fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1747 self.bloom_filter_properties.as_ref()
1748 }
1749
1750 /// Returns optional Data Page v2 compression ratio threshold for this column.
1751 fn data_page_v2_compression_ratio_threshold(&self) -> Option<f64> {
1752 self.data_page_v2_compression_ratio_threshold
1753 }
1754
1755 /// If bloom filter is enabled and NDV was not explicitly set, resolve it to the
1756 /// given `default_ndv` (typically derived from `max_row_group_row_count`).
1757 fn resolve_bloom_filter_ndv(&mut self, default_ndv: u64) {
1758 if !self.bloom_filter_ndv_is_set {
1759 if let Some(ref mut bf) = self.bloom_filter_properties {
1760 bf.ndv = default_ndv;
1761 }
1762 }
1763 }
1764}
1765
1766/// Reference counted reader properties.
1767pub type ReaderPropertiesPtr = Arc<ReaderProperties>;
1768
/// Value used for `read_bloom_filter` when the builder was never told otherwise:
/// bloom filters are not read.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
/// Value used for `read_page_stats` when the builder was never told otherwise:
/// page-level statistics are not decoded.
const DEFAULT_READ_PAGE_STATS: bool = false;
1771
1772/// Configuration settings for reading parquet files.
1773///
1774/// All properties are immutable and `Send` + `Sync`.
1775/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
1776///
1777/// # Example
1778///
1779/// ```rust
1780/// use parquet::file::properties::ReaderProperties;
1781///
1782/// // Create properties with default configuration.
1783/// let props = ReaderProperties::builder().build();
1784///
1785/// // Use properties builder to set certain options and assemble the configuration.
1786/// let props = ReaderProperties::builder()
1787/// .set_backward_compatible_lz4(false)
1788/// .build();
1789/// ```
1790pub struct ReaderProperties {
1791 codec_options: CodecOptions,
1792 read_bloom_filter: bool,
1793 read_page_stats: bool,
1794}
1795
1796impl ReaderProperties {
1797 /// Returns builder for reader properties with default values.
1798 pub fn builder() -> ReaderPropertiesBuilder {
1799 ReaderPropertiesBuilder::with_defaults()
1800 }
1801
1802 /// Returns codec options.
1803 pub(crate) fn codec_options(&self) -> &CodecOptions {
1804 &self.codec_options
1805 }
1806
1807 /// Returns whether to read bloom filter
1808 pub(crate) fn read_bloom_filter(&self) -> bool {
1809 self.read_bloom_filter
1810 }
1811
1812 /// Returns whether to read page level statistics
1813 pub(crate) fn read_page_stats(&self) -> bool {
1814 self.read_page_stats
1815 }
1816}
1817
1818/// Builder for parquet file reader configuration. See example on
1819/// [`ReaderProperties`]
1820pub struct ReaderPropertiesBuilder {
1821 codec_options_builder: CodecOptionsBuilder,
1822 read_bloom_filter: Option<bool>,
1823 read_page_stats: Option<bool>,
1824}
1825
1826/// Reader properties builder.
1827impl ReaderPropertiesBuilder {
1828 /// Returns default state of the builder.
1829 fn with_defaults() -> Self {
1830 Self {
1831 codec_options_builder: CodecOptionsBuilder::default(),
1832 read_bloom_filter: None,
1833 read_page_stats: None,
1834 }
1835 }
1836
1837 /// Finalizes the configuration and returns immutable reader properties struct.
1838 pub fn build(self) -> ReaderProperties {
1839 ReaderProperties {
1840 codec_options: self.codec_options_builder.build(),
1841 read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1842 read_page_stats: self.read_page_stats.unwrap_or(DEFAULT_READ_PAGE_STATS),
1843 }
1844 }
1845
1846 /// Enable/disable backward compatible LZ4.
1847 ///
1848 /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1849 /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1850 /// with files generated by older versions of this library, and LZ4_RAW, for backward
1851 /// compatibility with files generated by older versions of parquet-cpp.
1852 ///
1853 /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1854 pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1855 self.codec_options_builder = self
1856 .codec_options_builder
1857 .set_backward_compatible_lz4(value);
1858 self
1859 }
1860
1861 /// Enable/disable reading bloom filter
1862 ///
1863 /// If reading bloom filter is enabled, bloom filter will be read from the file.
1864 /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1865 ///
1866 /// By default bloom filter is set to be read.
1867 pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1868 self.read_bloom_filter = Some(value);
1869 self
1870 }
1871
1872 /// Enable/disable reading page-level statistics
1873 ///
1874 /// If set to `true`, then the reader will decode and populate the [`Statistics`] for
1875 /// each page, if present.
1876 /// If set to `false`, then the reader will skip decoding the statistics.
1877 ///
1878 /// By default statistics will not be decoded.
1879 ///
1880 /// [`Statistics`]: crate::file::statistics::Statistics
1881 pub fn set_read_page_statistics(mut self, value: bool) -> Self {
1882 self.read_page_stats = Some(value);
1883 self
1884 }
1885}
1886
#[cfg(test)]
mod tests {
    //! Unit tests for writer, reader and bloom filter properties.
    use super::*;

    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(
            props.max_row_group_row_count(),
            Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT)
        );
        assert_eq!(props.max_row_group_bytes(), None);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        assert!(
            props
                .bloom_filter_properties(&ColumnPath::from("col"))
                .is_none()
        );
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_row_count(Some(40))
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_max_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        fn test_props(props: &WriterProperties) {
            assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
            assert_eq!(props.data_page_size_limit(), 10);
            assert_eq!(props.dictionary_page_size_limit(), 20);
            assert_eq!(props.write_batch_size(), 30);
            assert_eq!(props.max_row_group_row_count(), Some(40));
            assert_eq!(props.created_by(), "default");
            assert_eq!(
                props.key_value_metadata(),
                Some(&vec![
                    KeyValue::new("key".to_string(), "value".to_string(),)
                ])
            );

            assert_eq!(
                props.encoding(&ColumnPath::from("a")),
                Some(Encoding::DELTA_BINARY_PACKED)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("a")),
                Compression::GZIP(Default::default())
            );
            assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("a")),
                EnabledStatistics::None
            );

            assert_eq!(
                props.encoding(&ColumnPath::from("col")),
                Some(Encoding::RLE)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("col")),
                Compression::SNAPPY
            );
            assert!(props.dictionary_enabled(&ColumnPath::from("col")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("col")),
                EnabledStatistics::Chunk
            );
            assert_eq!(
                props.bloom_filter_properties(&ColumnPath::from("col")),
                Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
            );
        }

        // Test direct build of properties
        test_props(&props);

        // Test that into_builder() gives the same result
        let props_into_builder_and_back = props.into_builder().build();
        test_props(&props_into_builder_and_back);
    }

    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: DEFAULT_BLOOM_FILTER_FPP,
                ndv: DEFAULT_BLOOM_FILTER_NDV,
            })
        );
    }

    #[test]
    #[allow(deprecated)]
    fn test_writer_properties_deprecated_max_row_group_size_still_works() {
        let props = WriterProperties::builder()
            .set_max_row_group_size(42)
            .build();

        assert_eq!(props.max_row_group_row_count(), Some(42));
        assert_eq!(props.max_row_group_size(), 42);
    }

    #[test]
    #[should_panic(expected = "Cannot have a 0 max row group row count")]
    fn test_writer_properties_panic_on_zero_row_group_row_count() {
        let _ = WriterProperties::builder().set_max_row_group_row_count(Some(0));
    }

    #[test]
    #[should_panic(expected = "Cannot have a 0 max row group bytes")]
    fn test_writer_properties_panic_on_zero_row_group_bytes() {
        let _ = WriterProperties::builder().set_max_row_group_bytes(Some(0));
    }

    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_max_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: DEFAULT_BLOOM_FILTER_FPP,
                ndv: 100,
            })
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: DEFAULT_BLOOM_FILTER_NDV,
            })
        );
    }

    #[test]
    fn test_writer_properties_column_data_page_v2_compression_ratio_threshold() {
        let props = WriterProperties::builder()
            .set_data_page_v2_compression_ratio_threshold(0.5)
            .set_column_data_page_v2_compression_ratio_threshold(ColumnPath::from("col"), 0.1)
            .build();

        assert_eq!(props.data_page_v2_compression_ratio_threshold(), 0.5);
        assert_eq!(
            props.column_data_page_v2_compression_ratio_threshold(&ColumnPath::from("col")),
            0.1
        );
        assert_eq!(
            props.column_data_page_v2_compression_ratio_threshold(&ColumnPath::from("other")),
            0.5
        );
    }

    #[test]
    #[should_panic(
        expected = "data_page_v2_compression_ratio_threshold must be a positive finite number"
    )]
    fn test_writer_properties_panic_on_invalid_data_page_v2_compression_ratio_threshold() {
        WriterProperties::builder()
            .set_data_page_v2_compression_ratio_threshold(0.0)
            .build();
    }

    #[test]
    #[allow(deprecated)]
    fn test_writer_properties_deprecated_bloom_filter_ndv_setters_still_work() {
        let col = ColumnPath::from("col");
        let props = WriterProperties::builder()
            .set_bloom_filter_ndv(100)
            .set_column_bloom_filter_ndv(col.clone(), 200)
            .build();
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("other")),
            Some(&BloomFilterProperties {
                fpp: DEFAULT_BLOOM_FILTER_FPP,
                ndv: 100,
            })
        );
        assert_eq!(
            props.bloom_filter_properties(&col),
            Some(&BloomFilterProperties {
                fpp: DEFAULT_BLOOM_FILTER_FPP,
                ndv: 200,
            })
        );
    }

    #[test]
    fn test_writer_properties_column_dictionary_page_size_limit() {
        let props = WriterProperties::builder()
            .set_dictionary_page_size_limit(100)
            .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
            .build();

        assert_eq!(props.dictionary_page_size_limit(), 100);
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
            10
        );
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
            100
        );
    }

    #[test]
    fn test_writer_properties_column_data_page_size_limit() {
        let props = WriterProperties::builder()
            .set_data_page_size_limit(100)
            .set_column_data_page_size_limit(ColumnPath::from("col"), 10)
            .build();

        assert_eq!(props.data_page_size_limit(), 100);
        assert_eq!(
            props.column_data_page_size_limit(&ColumnPath::from("col")),
            10
        );
        assert_eq!(
            props.column_data_page_size_limit(&ColumnPath::from("other")),
            100
        );
    }

    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(!props.read_bloom_filter());
        // Page-level statistics are not decoded unless requested.
        assert!(!props.read_page_stats());
    }

    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
    }

    #[test]
    fn test_reader_properties_builder_read_flags() {
        // Explicitly enabling both flags must override the `false` defaults.
        let enabled = ReaderProperties::builder()
            .set_read_bloom_filter(true)
            .set_read_page_statistics(true)
            .build();
        assert!(enabled.read_bloom_filter());
        assert!(enabled.read_page_stats());

        // Explicitly disabling them is also honoured.
        let disabled = ReaderProperties::builder()
            .set_read_bloom_filter(false)
            .set_read_page_statistics(false)
            .build();
        assert!(!disabled.read_bloom_filter());
        assert!(!disabled.read_page_stats());
    }

    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);

        // test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }

    #[test]
    fn test_cdc_options_equality() {
        let opts = CdcOptions::default();
        assert_eq!(opts, CdcOptions::default());

        let custom = CdcOptions {
            min_chunk_size: 1024,
            max_chunk_size: 8192,
            norm_level: 1,
        };
        assert_eq!(custom, custom);
        assert_ne!(opts, custom);
    }

    #[test]
    fn test_bloom_filter_builder_default() {
        let props = BloomFilterProperties::builder().build();
        assert_eq!(props.fpp, DEFAULT_BLOOM_FILTER_FPP);
        assert_eq!(props.ndv, DEFAULT_BLOOM_FILTER_NDV);
        assert_eq!(props, BloomFilterProperties::default());
        assert_eq!(
            BloomFilterPropertiesBuilder::new().build(),
            BloomFilterProperties::default()
        );
    }

    #[test]
    fn test_bloom_filter_builder_explicit_fpp() {
        let props = BloomFilterProperties::builder().with_fpp(0.01).build();
        assert_eq!(props.fpp, 0.01);
        assert_eq!(props.ndv, DEFAULT_BLOOM_FILTER_NDV);
    }

    #[test]
    fn test_bloom_filter_builder_explicit_ndv() {
        let props = BloomFilterProperties::builder().with_max_ndv(1000).build();
        assert_eq!(props.fpp, DEFAULT_BLOOM_FILTER_FPP);
        assert_eq!(props.ndv, 1000);
    }

    #[test]
    fn test_bloom_filter_builder_validates_fpp() {
        // NaN is included because it compares false against every bound.
        for wrong_val in [0.0_f64, 1.0, -0.5, 2.0, f64::NAN] {
            let result = std::panic::catch_unwind(|| {
                BloomFilterProperties::builder().with_fpp(wrong_val).build()
            });
            assert!(
                result.is_err(),
                "with_fpp({wrong_val}).build() should reject value outside (0, 1)"
            );
        }
    }

    #[test]
    fn test_bloom_filter_builder_try_build_validates_fpp() {
        // NaN is included because it compares false against every bound.
        for wrong_val in [0.0_f64, 1.0, -0.5, 2.0, f64::NAN] {
            let result = BloomFilterProperties::builder()
                .with_fpp(wrong_val)
                .try_build();
            assert!(
                result.is_err(),
                "try_build() should return Err for fpp outside (0, 1)"
            );
        }

        let ok = BloomFilterProperties::builder()
            .with_fpp(0.01)
            .with_max_ndv(1000)
            .try_build()
            .expect("valid fpp should yield Ok");
        assert_eq!(ok.fpp, 0.01);
        assert_eq!(ok.ndv, 1000);
    }

    #[test]
    fn test_column_specific_implicit_ndv_uses_row_group_size() {
        let custom_row_group_size: usize = 7777;
        let col = ColumnPath::from("col");
        let props = WriterProperties::builder()
            .set_max_row_group_row_count(Some(custom_row_group_size))
            .set_column_bloom_filter_enabled(col.clone(), true)
            .build();
        let bf = props
            .bloom_filter_properties(&col)
            .expect("bloom filter should be enabled for col");

        assert_eq!(bf.ndv, custom_row_group_size as u64);
        assert_eq!(bf.fpp, DEFAULT_BLOOM_FILTER_FPP);
    }

    #[test]
    fn test_set_bloom_filter_properties_applied_globally() {
        let bf = BloomFilterProperties::builder()
            .with_fpp(0.01)
            .with_max_ndv(500)
            .build();
        let props = WriterProperties::builder()
            .set_bloom_filter_properties(bf.clone())
            .build();

        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("a")),
            Some(&bf),
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("b")),
            Some(&bf),
        );
    }

    #[test]
    fn test_set_column_bloom_filter_properties_overrides_global() {
        let global = BloomFilterProperties::builder()
            .with_fpp(0.01)
            .with_max_ndv(500)
            .build();
        let tailored = BloomFilterProperties::builder()
            .with_fpp(0.02)
            .with_max_ndv(1000)
            .build();

        let col = ColumnPath::from("col");
        let props = WriterProperties::builder()
            .set_bloom_filter_properties(global.clone())
            .set_column_bloom_filter_properties(col.clone(), tailored.clone())
            .build();

        assert_eq!(props.bloom_filter_properties(&col), Some(&tailored));
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("other")),
            Some(&global)
        );
    }

    #[test]
    fn test_set_bloom_filter_properties_preserve_explicit_ndv() {
        let bf = BloomFilterProperties::builder().with_max_ndv(42).build();
        let props = WriterProperties::builder()
            .set_max_row_group_row_count(Some(99_999))
            .set_bloom_filter_properties(bf)
            .build();
        let result = props
            .bloom_filter_properties(&ColumnPath::from("col"))
            .expect("bloom filter should be enabled");

        assert_eq!(
            result.ndv, 42,
            "explicit ndv must not be overridden by row-group-size fallback"
        );
    }
}