// parquet/file/properties.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::{KeyValue, SortingColumn};
24use crate::schema::types::ColumnPath;
25use std::str::FromStr;
26use std::{collections::HashMap, sync::Arc};
27
/// Default value for [`WriterProperties::data_page_size_limit`]
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_row_count`]
pub const DEFAULT_MAX_ROW_GROUP_ROW_COUNT: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`].
///
/// Note: this is only the fallback default used when constructing [`BloomFilterProperties`]
/// directly. When using [`WriterPropertiesBuilder`], columns with bloom filters enabled
/// but without an explicit NDV will have their NDV resolved at build time to
/// [`WriterProperties::max_row_group_row_count`], which may differ from this constant
/// if the user configured a custom row group size.
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = DEFAULT_MAX_ROW_GROUP_ROW_COUNT as u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
/// Default minimum chunk size for content-defined chunking: 256 KiB.
pub const DEFAULT_CDC_MIN_CHUNK_SIZE: usize = 256 * 1024;
/// Default maximum chunk size for content-defined chunking: 1024 KiB.
pub const DEFAULT_CDC_MAX_CHUNK_SIZE: usize = 1024 * 1024;
/// Default normalization level for content-defined chunking.
pub const DEFAULT_CDC_NORM_LEVEL: i32 = 0;
76
/// EXPERIMENTAL: Options for content-defined chunking (CDC).
///
/// Content-defined chunking is an experimental feature that optimizes parquet
/// files for content addressable storage (CAS) systems by writing data pages
/// according to content-defined chunk boundaries. This allows for more
/// efficient deduplication of data across files, hence more efficient network
/// transfers and storage.
///
/// Each content-defined chunk is written as a separate parquet data page. The
/// following options control the chunks' size and the chunking process. Note
/// that the chunk size is calculated based on the logical value of the data,
/// before any encoding or compression is applied.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CdcOptions {
    /// Minimum chunk size in bytes, default is 256 KiB.
    /// The rolling hash will not be updated until this size is reached for each chunk.
    /// Note that all data sent through the hash function is counted towards the chunk
    /// size, including definition and repetition levels if present.
    pub min_chunk_size: usize,
    /// Maximum chunk size in bytes, default is 1024 KiB.
    /// The chunker will create a new chunk whenever the chunk size exceeds this value.
    /// Note that the parquet writer has a related [`data_page_size_limit`] property that
    /// controls the maximum size of a parquet data page after encoding. While setting
    /// `data_page_size_limit` to a smaller value than `max_chunk_size` doesn't affect
    /// the chunking effectiveness, it results in more small parquet data pages.
    ///
    /// [`data_page_size_limit`]: WriterPropertiesBuilder::set_data_page_size_limit
    pub max_chunk_size: usize,
    /// Number of bits to adjust the gearhash mask by, in order to center the chunk size
    /// around the average size more aggressively, default is 0.
    /// Increasing the normalization level increases the probability of finding a chunk,
    /// improving the deduplication ratio, but also increases the number of small chunks
    /// resulting in many small parquet data pages. The default value provides a good
    /// balance between deduplication ratio and fragmentation.
    /// Use norm_level=1 or norm_level=2 to reach a higher deduplication ratio at the
    /// expense of fragmentation. Negative values can also be used to reduce the
    /// probability of finding a chunk, resulting in larger chunks and fewer data pages.
    /// Note that values outside [-3, 3] are not recommended, prefer using the default
    /// value of 0 for most use cases.
    pub norm_level: i32,
}
118
119impl Default for CdcOptions {
120 fn default() -> Self {
121 Self {
122 min_chunk_size: DEFAULT_CDC_MIN_CHUNK_SIZE,
123 max_chunk_size: DEFAULT_CDC_MAX_CHUNK_SIZE,
124 norm_level: DEFAULT_CDC_NORM_LEVEL,
125 }
126 }
127}
128
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
///
/// Can also be parsed from the strings `"PARQUET_1_0"` / `"PARQUET_2_0"`
/// (or their lowercase forms) via its [`FromStr`] implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}
140
141impl WriterVersion {
142 /// Returns writer version as `i32`.
143 pub fn as_num(&self) -> i32 {
144 match self {
145 WriterVersion::PARQUET_1_0 => 1,
146 WriterVersion::PARQUET_2_0 => 2,
147 }
148 }
149}
150
151impl FromStr for WriterVersion {
152 type Err = String;
153
154 fn from_str(s: &str) -> Result<Self, Self::Err> {
155 match s {
156 "PARQUET_1_0" | "parquet_1_0" => Ok(WriterVersion::PARQUET_1_0),
157 "PARQUET_2_0" | "parquet_2_0" => Ok(WriterVersion::PARQUET_2_0),
158 _ => Err(format!("Invalid writer version: {s}")),
159 }
160 }
161}
162
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The default is [`AfterRowGroup`](Self::AfterRowGroup), see
/// [`DEFAULT_BLOOM_FILTER_POSITION`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
180
/// Reference counted writer properties.
pub type WriterPropertiesPtr = Arc<WriterProperties>;

/// Resolved state of [`WriterPropertiesBuilder::set_offset_index_disabled`].
///
/// When a user disables offset indexes but page-level statistics are enabled,
/// the setting is overridden (offset indexes remain enabled). This enum
/// preserves the user's original intent so that a round-trip through
/// `WriterPropertiesBuilder` does not lose it.
///
/// The resolution from the builder's raw `bool` happens in
/// [`WriterPropertiesBuilder::build`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OffsetIndexSetting {
    /// Offset indexes are enabled (the default).
    Enabled,
    /// User disabled offset indexes and no page-level statistics override it.
    Disabled,
    /// User disabled offset indexes, but page-level statistics require them,
    /// so they remain enabled.
    DisabledOverridden,
}
200
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #     basic::{Compression, Encoding},
/// #     file::properties::*,
/// #     schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    /// Best-effort maximum number of rows per data page.
    data_page_row_count_limit: usize,
    /// Number of rows written per internal batch.
    write_batch_size: usize,
    /// Maximum rows per row group; `None` means unlimited.
    max_row_group_row_count: Option<usize>,
    /// Maximum estimated size of a row group in bytes; `None` means unlimited.
    max_row_group_bytes: Option<usize>,
    /// Where Bloom filters are written in the file.
    bloom_filter_position: BloomFilterPosition,
    /// Parquet format version declared in the file metadata.
    writer_version: WriterVersion,
    /// "created by" string written into the file metadata.
    created_by: String,
    /// Resolved offset-index setting (preserves user intent, see
    /// [`OffsetIndexSetting`]).
    offset_index_setting: OffsetIndexSetting,
    /// Optional application-defined key/value metadata.
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    /// Fallback properties used for columns without a per-column override.
    default_column_properties: ColumnProperties,
    /// Per-column property overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Declared sort order of the rows, if any.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Max length of min/max values in the column index; `None` = no limit.
    column_index_truncate_length: Option<usize>,
    /// Max length of min/max values in statistics; `None` = no limit.
    statistics_truncate_length: Option<usize>,
    /// Whether type coercion is enabled.
    coerce_types: bool,
    /// EXPERIMENTAL: content-defined chunking options; `None` = disabled.
    content_defined_chunking: Option<CdcOptions>,
    /// Optional file encryption configuration.
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
256
257impl Default for WriterProperties {
258 fn default() -> Self {
259 Self::builder().build()
260 }
261}
262
impl WriterProperties {
    /// Create a new [`WriterProperties`] with the default settings
    ///
    /// See [`WriterProperties::builder`] for customising settings
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
    /// properties.
    pub fn builder() -> WriterPropertiesBuilder {
        WriterPropertiesBuilder::default()
    }

    /// Converts this [`WriterProperties`] into a [`WriterPropertiesBuilder`]
    /// Used for mutating existing property settings
    pub fn into_builder(self) -> WriterPropertiesBuilder {
        self.into()
    }

    /// Returns data page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
    pub fn data_page_size_limit(&self) -> usize {
        self.default_column_properties
            .data_page_size_limit()
            .unwrap_or(DEFAULT_PAGE_SIZE)
    }

    /// Returns data page size limit for a specific column.
    ///
    /// Takes precedence over [`Self::data_page_size_limit`].
    ///
    /// Note: this is a best effort limit based on the write batch size.
    pub fn column_data_page_size_limit(&self, col: &ColumnPath) -> usize {
        // Per-column setting wins, then the default column properties,
        // then the compiled-in default.
        self.column_properties
            .get(col)
            .and_then(|c| c.data_page_size_limit())
            .or_else(|| self.default_column_properties.data_page_size_limit())
            .unwrap_or(DEFAULT_PAGE_SIZE)
    }

    /// Returns dictionary page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
    pub fn dictionary_page_size_limit(&self) -> usize {
        self.default_column_properties
            .dictionary_page_size_limit()
            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
    }

    /// Returns dictionary page size limit for a specific column.
    ///
    /// Takes precedence over [`Self::dictionary_page_size_limit`].
    ///
    /// Note: this is a best effort limit based on the write batch size.
    pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
        self.column_properties
            .get(col)
            .and_then(|c| c.dictionary_page_size_limit())
            .or_else(|| self.default_column_properties.dictionary_page_size_limit())
            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
    }

    /// Returns the maximum page row count
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
    pub fn data_page_row_count_limit(&self) -> usize {
        self.data_page_row_count_limit
    }

    /// Returns configured batch size for writes.
    ///
    /// When writing a batch of data, this setting allows to split it internally into
    /// smaller batches so we can better estimate the size of a page currently being
    /// written.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
    pub fn write_batch_size(&self) -> usize {
        self.write_batch_size
    }

    /// Returns maximum number of rows in a row group, or `usize::MAX` if unlimited.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
    #[deprecated(since = "58.0.0", note = "Use `max_row_group_row_count` instead")]
    pub fn max_row_group_size(&self) -> usize {
        self.max_row_group_row_count.unwrap_or(usize::MAX)
    }

    /// Returns maximum number of rows in a row group, or `None` if unlimited.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_row_count`]
    pub fn max_row_group_row_count(&self) -> Option<usize> {
        self.max_row_group_row_count
    }

    /// Returns maximum size of a row group in bytes, or `None` if unlimited.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_bytes`]
    pub fn max_row_group_bytes(&self) -> Option<usize> {
        self.max_row_group_bytes
    }

    /// Returns bloom filter position.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
        self.bloom_filter_position
    }

    /// Returns configured writer version.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
    pub fn writer_version(&self) -> WriterVersion {
        self.writer_version
    }

    /// Returns `created_by` string.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_created_by`]
    pub fn created_by(&self) -> &str {
        &self.created_by
    }

    /// Returns `true` if offset index writing is disabled.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
    pub fn offset_index_disabled(&self) -> bool {
        // `DisabledOverridden` (the user disabled offset indexes, but
        // page-level statistics require them) intentionally reports `false`.
        matches!(self.offset_index_setting, OffsetIndexSetting::Disabled)
    }

    /// Returns `key_value_metadata` KeyValue pairs.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns sorting columns.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Returns the maximum length of truncated min/max values in the column index.
    ///
    /// `None` if truncation is disabled, must be greater than 0 otherwise.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
    pub fn column_index_truncate_length(&self) -> Option<usize> {
        self.column_index_truncate_length
    }

    /// Returns the maximum length of truncated min/max values in [`Statistics`].
    ///
    /// `None` if truncation is disabled, must be greater than 0 otherwise.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn statistics_truncate_length(&self) -> Option<usize> {
        self.statistics_truncate_length
    }

    /// Returns `true` if type coercion is enabled.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
    pub fn coerce_types(&self) -> bool {
        self.coerce_types
    }

    /// EXPERIMENTAL: Returns content-defined chunking options, or `None` if CDC is disabled.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_content_defined_chunking`]
    pub fn content_defined_chunking(&self) -> Option<&CdcOptions> {
        self.content_defined_chunking.as_ref()
    }

    /// Returns encoding for a data page, when dictionary encoding is enabled.
    ///
    /// This is not configurable.
    #[inline]
    pub fn dictionary_data_page_encoding(&self) -> Encoding {
        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
        // Dictionary values are encoded using RLE_DICTIONARY encoding.
        Encoding::RLE_DICTIONARY
    }

    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
    ///
    /// This is not configurable.
    #[inline]
    pub fn dictionary_page_encoding(&self) -> Encoding {
        // PLAIN_DICTIONARY is deprecated in writer version 1.
        // Dictionary is encoded using plain encoding.
        Encoding::PLAIN
    }

    /// Returns encoding for a column, if set.
    ///
    /// In case when dictionary is enabled, returns fallback encoding.
    ///
    /// If encoding is not set, then column writer will choose the best encoding
    /// based on the column type.
    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
        self.column_properties
            .get(col)
            .and_then(|c| c.encoding())
            .or_else(|| self.default_column_properties.encoding())
    }

    /// Returns compression codec for a column.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
    pub fn compression(&self, col: &ColumnPath) -> Compression {
        self.column_properties
            .get(col)
            .and_then(|c| c.compression())
            .or_else(|| self.default_column_properties.compression())
            .unwrap_or(DEFAULT_COMPRESSION)
    }

    /// Returns `true` if dictionary encoding is enabled for a column.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
        self.column_properties
            .get(col)
            .and_then(|c| c.dictionary_enabled())
            .or_else(|| self.default_column_properties.dictionary_enabled())
            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
    }

    /// Returns which statistics are written for a column.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
        self.column_properties
            .get(col)
            .and_then(|c| c.statistics_enabled())
            .or_else(|| self.default_column_properties.statistics_enabled())
            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
    }

    /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
        self.column_properties
            .get(col)
            .and_then(|c| c.write_page_header_statistics())
            .or_else(|| {
                self.default_column_properties
                    .write_page_header_statistics()
            })
            .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
    }

    /// Returns the [`BloomFilterProperties`] for the given column
    ///
    /// Returns `None` if bloom filter is disabled
    ///
    /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
        self.column_properties
            .get(col)
            .and_then(|c| c.bloom_filter_properties())
            .or_else(|| self.default_column_properties.bloom_filter_properties())
    }

    /// Return file encryption properties
    ///
    /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
    #[cfg(feature = "encryption")]
    pub fn file_encryption_properties(&self) -> Option<&Arc<FileEncryptionProperties>> {
        self.file_encryption_properties.as_ref()
    }
}
547
/// Builder for [`WriterProperties`] Parquet writer configuration.
///
/// See example on [`WriterProperties`]
#[derive(Debug, Clone)]
pub struct WriterPropertiesBuilder {
    /// Best-effort maximum number of rows per data page.
    data_page_row_count_limit: usize,
    /// Number of rows written per internal batch.
    write_batch_size: usize,
    /// Maximum rows per row group; `None` means unlimited.
    max_row_group_row_count: Option<usize>,
    /// Maximum estimated size of a row group in bytes; `None` means unlimited.
    max_row_group_bytes: Option<usize>,
    /// Where Bloom filters are written in the file.
    bloom_filter_position: BloomFilterPosition,
    /// Parquet format version declared in the file metadata.
    writer_version: WriterVersion,
    /// "created by" string written into the file metadata.
    created_by: String,
    /// Raw user setting; resolved to an `OffsetIndexSetting` in `build()`.
    offset_index_disabled: bool,
    /// Optional application-defined key/value metadata.
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Fallback properties used for columns without a per-column override.
    default_column_properties: ColumnProperties,
    /// Per-column property overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Declared sort order of the rows, if any.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Max length of min/max values in the column index; `None` = no limit.
    column_index_truncate_length: Option<usize>,
    /// Max length of min/max values in statistics; `None` = no limit.
    statistics_truncate_length: Option<usize>,
    /// Whether type coercion is enabled.
    coerce_types: bool,
    /// EXPERIMENTAL: content-defined chunking options; `None` = disabled.
    content_defined_chunking: Option<CdcOptions>,
    /// Optional file encryption configuration.
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
572
573impl Default for WriterPropertiesBuilder {
574 /// Returns default state of the builder.
575 fn default() -> Self {
576 Self {
577 data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
578 write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
579 max_row_group_row_count: Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT),
580 max_row_group_bytes: None,
581 bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
582 writer_version: DEFAULT_WRITER_VERSION,
583 created_by: DEFAULT_CREATED_BY.to_string(),
584 offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
585 key_value_metadata: None,
586 default_column_properties: Default::default(),
587 column_properties: HashMap::new(),
588 sorting_columns: None,
589 column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
590 statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
591 coerce_types: DEFAULT_COERCE_TYPES,
592 content_defined_chunking: None,
593 #[cfg(feature = "encryption")]
594 file_encryption_properties: None,
595 }
596 }
597}
598
599impl WriterPropertiesBuilder {
600 /// Finalizes the configuration and returns immutable writer properties struct.
601 pub fn build(self) -> WriterProperties {
602 // Pre-compute offset_index_setting
603 let offset_index_setting = if self.offset_index_disabled {
604 let default_page_stats_enabled = self.default_column_properties.statistics_enabled()
605 == Some(EnabledStatistics::Page);
606 let column_page_stats_enabled = self.column_properties.iter().any(|path_props| {
607 path_props.1.statistics_enabled() == Some(EnabledStatistics::Page)
608 });
609 if default_page_stats_enabled || column_page_stats_enabled {
610 OffsetIndexSetting::DisabledOverridden
611 } else {
612 OffsetIndexSetting::Disabled
613 }
614 } else {
615 OffsetIndexSetting::Enabled
616 };
617
618 // Resolve bloom filter NDV for columns where it wasn't explicitly set:
619 // default to max_row_group_row_count so the filter is never undersized.
620 let default_ndv = self
621 .max_row_group_row_count
622 .unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT) as u64;
623 let mut default_column_properties = self.default_column_properties;
624 default_column_properties.resolve_bloom_filter_ndv(default_ndv);
625 let mut column_properties = self.column_properties;
626 for props in column_properties.values_mut() {
627 props.resolve_bloom_filter_ndv(default_ndv);
628 }
629
630 WriterProperties {
631 data_page_row_count_limit: self.data_page_row_count_limit,
632 write_batch_size: self.write_batch_size,
633 max_row_group_row_count: self.max_row_group_row_count,
634 max_row_group_bytes: self.max_row_group_bytes,
635 bloom_filter_position: self.bloom_filter_position,
636 writer_version: self.writer_version,
637 created_by: self.created_by,
638 offset_index_setting,
639 key_value_metadata: self.key_value_metadata,
640 default_column_properties,
641 column_properties,
642 sorting_columns: self.sorting_columns,
643 column_index_truncate_length: self.column_index_truncate_length,
644 statistics_truncate_length: self.statistics_truncate_length,
645 coerce_types: self.coerce_types,
646 content_defined_chunking: self.content_defined_chunking,
647 #[cfg(feature = "encryption")]
648 file_encryption_properties: self.file_encryption_properties,
649 }
650 }
651
652 // ----------------------------------------------------------------------
653 // Writer properties related to a file
654
655 /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
656 /// via [`DEFAULT_WRITER_VERSION`])
657 ///
658 /// This value can determine what features some readers will support.
659 ///
660 /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
661 pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
662 self.writer_version = value;
663 self
664 }
665
666 /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
667 /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
668 ///
669 /// The parquet writer will attempt to limit the number of rows in
670 /// each `DataPage` to this value. Reducing this value will result
671 /// in larger parquet files, but may improve the effectiveness of
672 /// page index based predicate pushdown during reading.
673 ///
674 /// Note: this is a best effort limit based on value of
675 /// [`set_write_batch_size`](Self::set_write_batch_size).
676 pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
677 self.data_page_row_count_limit = value;
678 self
679 }
680
681 /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
682 ///
683 /// For performance reasons, data for each column is written in
684 /// batches of this size.
685 ///
686 /// Additional limits such as such as
687 /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
688 /// are checked between batches, and thus the write batch size value acts as an
689 /// upper-bound on the enforcement granularity of other limits.
690 pub fn set_write_batch_size(mut self, value: usize) -> Self {
691 self.write_batch_size = value;
692 self
693 }
694
695 /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
696 /// via [`DEFAULT_MAX_ROW_GROUP_ROW_COUNT`]).
697 ///
698 /// # Panics
699 /// If the value is set to 0.
700 #[deprecated(since = "58.0.0", note = "Use `set_max_row_group_row_count` instead")]
701 pub fn set_max_row_group_size(mut self, value: usize) -> Self {
702 assert!(value > 0, "Cannot have a 0 max row group size");
703 self.max_row_group_row_count = Some(value);
704 self
705 }
706
707 /// Sets maximum number of rows in a row group, or `None` for unlimited.
708 ///
709 /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
710 /// the row group with the smaller limit will be produced.
711 ///
712 /// # Panics
713 /// If the value is `Some(0)`.
714 pub fn set_max_row_group_row_count(mut self, value: Option<usize>) -> Self {
715 assert_ne!(value, Some(0), "Cannot have a 0 max row group row count");
716 self.max_row_group_row_count = value;
717 self
718 }
719
720 /// Sets maximum size of a row group in bytes, or `None` for unlimited.
721 ///
722 /// Row groups are flushed when their estimated encoded size exceeds this threshold.
723 /// This is similar to the official Java implementation for `parquet.block.size`'s behavior.
724 ///
725 /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
726 /// the row group with the smaller limit will be produced.
727 ///
728 /// # Panics
729 /// If the value is `Some(0)`.
730 pub fn set_max_row_group_bytes(mut self, value: Option<usize>) -> Self {
731 assert_ne!(value, Some(0), "Cannot have a 0 max row group bytes");
732 self.max_row_group_bytes = value;
733 self
734 }
735
736 /// Sets where in the final file Bloom Filters are written (defaults to [`AfterRowGroup`]
737 /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
738 ///
739 /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
740 pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
741 self.bloom_filter_position = value;
742 self
743 }
744
745 /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
746 /// [`DEFAULT_CREATED_BY`]).
747 ///
748 /// This is a string that will be written into the file metadata
749 pub fn set_created_by(mut self, value: String) -> Self {
750 self.created_by = value;
751 self
752 }
753
754 /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
755 /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
756 ///
757 /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
758 ///
759 /// Note: As the offset indexes are useful for accessing data by row number,
760 /// they are always written by default, regardless of whether other statistics
761 /// are enabled. Disabling this metadata may result in a degradation in read
762 /// performance, so use this option with care.
763 ///
764 /// [`Page`]: EnabledStatistics::Page
765 pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
766 self.offset_index_disabled = value;
767 self
768 }
769
770 /// Sets "key_value_metadata" property (defaults to `None`).
771 pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
772 self.key_value_metadata = value;
773 self
774 }
775
776 /// Sets sorting order of rows in the row group if any (defaults to `None`).
777 pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
778 self.sorting_columns = value;
779 self
780 }
781
782 /// Sets the max length of min/max value fields when writing the column
783 /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
784 ///
785 /// This can be used to prevent columns with very long values (hundreds of
786 /// bytes long) from causing the parquet metadata to become huge.
787 ///
788 /// # Notes
789 ///
790 /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
791 /// set to [`EnabledStatistics::Page`].
792 ///
793 /// * If `Some`, must be greater than 0, otherwise will panic
794 /// * If `None`, there's no effective limit.
795 ///
796 /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
797 pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
798 if let Some(value) = max_length {
799 assert!(
800 value > 0,
801 "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`."
802 );
803 }
804
805 self.column_index_truncate_length = max_length;
806 self
807 }
808
809 /// Sets the max length of min/max value fields in row group and data page header
810 /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
811 ///
812 /// # Notes
813 /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
814 /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
815 /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
816 /// [`EnabledStatistics::Page`].
817 ///
818 /// * If `Some`, must be greater than 0, otherwise will panic
819 /// * If `None`, there's no effective limit.
820 ///
821 /// # See also
822 /// Truncation of Page Index statistics is controlled separately via
823 /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
824 ///
825 /// [`Statistics`]: crate::file::statistics::Statistics
826 pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
827 if let Some(value) = max_length {
828 assert!(
829 value > 0,
830 "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`."
831 );
832 }
833
834 self.statistics_truncate_length = max_length;
835 self
836 }
837
838 /// Should the writer coerce types to parquet native types (defaults to `false` via
839 /// [`DEFAULT_COERCE_TYPES`]).
840 ///
841 /// Leaving this option the default `false` will ensure the exact same data
842 /// written to parquet using this library will be read.
843 ///
844 /// Setting this option to `true` will result in parquet files that can be
845 /// read by more readers, but potentially lose information in the process.
846 ///
847 /// * Types such as [`DataType::Date64`], which have no direct corresponding
848 /// Parquet type, may be stored with lower precision.
849 ///
850 /// * The internal field names of `List` and `Map` types will be renamed if
851 /// necessary to match what is required by the newest Parquet specification.
852 ///
853 /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
854 ///
855 /// [`DataType::Date64`]: arrow_schema::DataType::Date64
856 /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
857 pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
858 self.coerce_types = coerce_types;
859 self
860 }
861
862 /// EXPERIMENTAL: Sets content-defined chunking options, or disables CDC with `None`.
863 ///
864 /// When enabled, data page boundaries are determined by a rolling hash of the
865 /// column values, so unchanged data produces identical byte sequences across
866 /// file versions. This enables efficient deduplication on content-addressable
867 /// storage systems.
868 ///
869 /// Only supported through the Arrow writer interface ([`ArrowWriter`]).
870 ///
871 /// # Panics
872 ///
873 /// Panics if `min_chunk_size == 0` or `max_chunk_size <= min_chunk_size`.
874 ///
875 /// [`ArrowWriter`]: crate::arrow::arrow_writer::ArrowWriter
876 pub fn set_content_defined_chunking(mut self, options: Option<CdcOptions>) -> Self {
877 if let Some(ref options) = options {
878 assert!(
879 options.min_chunk_size > 0,
880 "min_chunk_size must be positive"
881 );
882 assert!(
883 options.max_chunk_size > options.min_chunk_size,
884 "max_chunk_size ({}) must be greater than min_chunk_size ({})",
885 options.max_chunk_size,
886 options.min_chunk_size
887 );
888 }
889 self.content_defined_chunking = options;
890 self
891 }
892
    /// Sets the [`FileEncryptionProperties`] used to encrypt the written file
    /// (defaults to `None`, i.e. no encryption).
    ///
    /// Only available when the `encryption` feature is enabled. The properties
    /// are shared via [`Arc`] so the same configuration can be reused across writers.
    #[cfg(feature = "encryption")]
    pub fn with_file_encryption_properties(
        mut self,
        file_encryption_properties: Arc<FileEncryptionProperties>,
    ) -> Self {
        self.file_encryption_properties = Some(file_encryption_properties);
        self
    }
902
903 // ----------------------------------------------------------------------
904 // Setters for any column (global)
905
906 /// Sets default encoding for all columns.
907 ///
908 /// If dictionary is not enabled, this is treated as a primary encoding for all
909 /// columns. In case when dictionary is enabled for any column, this value is
910 /// considered to be a fallback encoding for that column.
911 ///
912 /// # Panics
913 ///
914 /// if dictionary encoding is specified, regardless of dictionary
915 /// encoding flag being set.
916 pub fn set_encoding(mut self, value: Encoding) -> Self {
917 self.default_column_properties.set_encoding(value);
918 self
919 }
920
921 /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
922 /// [`DEFAULT_COMPRESSION`]).
923 ///
924 /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
925 pub fn set_compression(mut self, value: Compression) -> Self {
926 self.default_column_properties.set_compression(value);
927 self
928 }
929
930 /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
931 /// via [`DEFAULT_DICTIONARY_ENABLED`]).
932 ///
933 /// Use this method to set dictionary encoding, instead of explicitly specifying
934 /// encoding in `set_encoding` method.
935 pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
936 self.default_column_properties.set_dictionary_enabled(value);
937 self
938 }
939
940 /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
941 /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
942 ///
943 /// The parquet writer will attempt to limit the size of each
944 /// `DataPage` used to store dictionaries to this many
945 /// bytes. Reducing this value will result in larger parquet
946 /// files, but may improve the effectiveness of page index based
947 /// predicate pushdown during reading.
948 ///
949 /// Note: this is a best effort limit based on value of
950 /// [`set_write_batch_size`](Self::set_write_batch_size).
951 pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
952 self.default_column_properties
953 .set_dictionary_page_size_limit(value);
954 self
955 }
956
957 /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
958 /// via [`DEFAULT_PAGE_SIZE`]).
959 ///
960 /// The parquet writer will attempt to limit the sizes of each
961 /// `DataPage` to this many bytes. Reducing this value will result
962 /// in larger parquet files, but may improve the effectiveness of
963 /// page index based predicate pushdown during reading.
964 ///
965 /// Note: this is a best effort limit based on value of
966 /// [`set_write_batch_size`](Self::set_write_batch_size).
967 pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
968 self.default_column_properties
969 .set_data_page_size_limit(value);
970 self
971 }
972
973 /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
974 /// [`DEFAULT_STATISTICS_ENABLED`]).
975 ///
976 /// [`Page`]: EnabledStatistics::Page
977 pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
978 self.default_column_properties.set_statistics_enabled(value);
979 self
980 }
981
982 /// enable/disable writing [`Statistics`] in the page header
983 /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
984 ///
985 /// Only applicable if [`Page`] level statistics are gathered.
986 ///
987 /// Setting this value to `true` can greatly increase the size of the resulting Parquet
988 /// file while yielding very little added benefit. Most modern Parquet implementations
989 /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
990 /// those in the page header.
991 ///
992 /// # Note
993 ///
994 /// Prior to version 56.0.0, the `parquet` crate always wrote these
995 /// statistics (the equivalent of setting this option to `true`). This was
996 /// changed in 56.0.0 to follow the recommendation in the Parquet
997 /// specification. See [issue #7580] for more details.
998 ///
999 /// [`Statistics`]: crate::file::statistics::Statistics
1000 /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
1001 /// [`Page`]: EnabledStatistics::Page
1002 /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
1003 pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
1004 self.default_column_properties
1005 .set_write_page_header_statistics(value);
1006 self
1007 }
1008
1009 /// Sets if bloom filter should be written for all columns (defaults to `false`).
1010 ///
1011 /// # Notes
1012 ///
1013 /// * If the bloom filter is enabled previously then it is a no-op.
1014 ///
1015 /// * If the bloom filter is not enabled, default values for ndv and fpp
1016 /// value are used used. See [`set_bloom_filter_ndv`] and
1017 /// [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
1018 ///
1019 /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
1020 /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
1021 pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
1022 self.default_column_properties
1023 .set_bloom_filter_enabled(value);
1024 self
1025 }
1026
1027 /// Sets the default target bloom filter false positive probability (fpp)
1028 /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
1029 ///
1030 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
1031 /// been called.
1032 ///
1033 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1034 pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
1035 self.default_column_properties.set_bloom_filter_fpp(value);
1036 self
1037 }
1038
1039 /// Sets default maximum expected number of distinct values (ndv) for bloom filter
1040 /// for all columns (defaults to [`DEFAULT_BLOOM_FILTER_NDV`]).
1041 ///
1042 /// The bloom filter is initially sized for this many distinct values at the
1043 /// configured FPP, then folded down after all values are inserted to achieve
1044 /// optimal size. A good heuristic is to set this to the expected number of rows
1045 /// in the row group.
1046 ///
1047 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
1048 /// been called.
1049 ///
1050 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1051 pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
1052 self.default_column_properties.set_bloom_filter_ndv(value);
1053 self
1054 }
1055
1056 // ----------------------------------------------------------------------
1057 // Setters for a specific column
1058
1059 /// Helper method to get existing or new mutable reference of column properties.
1060 #[inline]
1061 fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
1062 self.column_properties.entry(col).or_default()
1063 }
1064
1065 /// Sets encoding for a specific column.
1066 ///
1067 /// Takes precedence over [`Self::set_encoding`].
1068 ///
1069 /// If dictionary is not enabled, this is treated as a primary encoding for this
1070 /// column. In case when dictionary is enabled for this column, either through
1071 /// global defaults or explicitly, this value is considered to be a fallback
1072 /// encoding for this column.
1073 ///
1074 /// # Panics
1075 /// If user tries to set dictionary encoding here, regardless of dictionary
1076 /// encoding flag being set.
1077 pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
1078 self.get_mut_props(col).set_encoding(value);
1079 self
1080 }
1081
1082 /// Sets compression codec for a specific column.
1083 ///
1084 /// Takes precedence over [`Self::set_compression`].
1085 pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
1086 self.get_mut_props(col).set_compression(value);
1087 self
1088 }
1089
1090 /// Sets flag to enable/disable dictionary encoding for a specific column.
1091 ///
1092 /// Takes precedence over [`Self::set_dictionary_enabled`].
1093 pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
1094 self.get_mut_props(col).set_dictionary_enabled(value);
1095 self
1096 }
1097
1098 /// Sets dictionary page size limit for a specific column.
1099 ///
1100 /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
1101 pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
1102 self.get_mut_props(col)
1103 .set_dictionary_page_size_limit(value);
1104 self
1105 }
1106
1107 /// Sets data page size limit for a specific column.
1108 ///
1109 /// Takes precedence over [`Self::set_data_page_size_limit`].
1110 pub fn set_column_data_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
1111 self.get_mut_props(col).set_data_page_size_limit(value);
1112 self
1113 }
1114
1115 /// Sets [`EnabledStatistics`] level for a specific column.
1116 ///
1117 /// Takes precedence over [`Self::set_statistics_enabled`].
1118 pub fn set_column_statistics_enabled(
1119 mut self,
1120 col: ColumnPath,
1121 value: EnabledStatistics,
1122 ) -> Self {
1123 self.get_mut_props(col).set_statistics_enabled(value);
1124 self
1125 }
1126
1127 /// Sets whether to write [`Statistics`] in the page header for a specific column.
1128 ///
1129 /// Takes precedence over [`Self::set_write_page_header_statistics`].
1130 ///
1131 /// [`Statistics`]: crate::file::statistics::Statistics
1132 pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
1133 self.get_mut_props(col)
1134 .set_write_page_header_statistics(value);
1135 self
1136 }
1137
1138 /// Sets whether a bloom filter should be written for a specific column.
1139 ///
1140 /// Takes precedence over [`Self::set_bloom_filter_enabled`].
1141 pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
1142 self.get_mut_props(col).set_bloom_filter_enabled(value);
1143 self
1144 }
1145
1146 /// Sets the false positive probability for bloom filter for a specific column.
1147 ///
1148 /// Takes precedence over [`Self::set_bloom_filter_fpp`].
1149 pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
1150 self.get_mut_props(col).set_bloom_filter_fpp(value);
1151 self
1152 }
1153
1154 /// Sets the number of distinct values for bloom filter for a specific column.
1155 ///
1156 /// Takes precedence over [`Self::set_bloom_filter_ndv`].
1157 pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
1158 self.get_mut_props(col).set_bloom_filter_ndv(value);
1159 self
1160 }
1161}
1162
impl From<WriterProperties> for WriterPropertiesBuilder {
    /// Recovers a builder from finalized [`WriterProperties`], so an existing
    /// configuration can be tweaked and rebuilt.
    fn from(props: WriterProperties) -> Self {
        WriterPropertiesBuilder {
            data_page_row_count_limit: props.data_page_row_count_limit,
            write_batch_size: props.write_batch_size,
            max_row_group_row_count: props.max_row_group_row_count,
            max_row_group_bytes: props.max_row_group_bytes,
            bloom_filter_position: props.bloom_filter_position,
            writer_version: props.writer_version,
            created_by: props.created_by,
            // The builder tracks a plain boolean; anything other than
            // `Enabled` maps back to "disabled".
            offset_index_disabled: !matches!(
                props.offset_index_setting,
                OffsetIndexSetting::Enabled
            ),
            key_value_metadata: props.key_value_metadata,
            default_column_properties: props.default_column_properties,
            column_properties: props.column_properties,
            sorting_columns: props.sorting_columns,
            column_index_truncate_length: props.column_index_truncate_length,
            statistics_truncate_length: props.statistics_truncate_length,
            coerce_types: props.coerce_types,
            content_defined_chunking: props.content_defined_chunking,
            #[cfg(feature = "encryption")]
            file_encryption_properties: props.file_encryption_properties,
        }
    }
}
1190
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    ///
    /// Minimizes metadata size, at the cost of statistics-based pruning.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. In addition, this will enable the writing
    /// of the column index (the offset index is always written regardless of
    /// this setting). See [`ParquetColumnIndex`] for
    /// more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}
1221
1222impl FromStr for EnabledStatistics {
1223 type Err = String;
1224
1225 fn from_str(s: &str) -> Result<Self, Self::Err> {
1226 match s {
1227 "NONE" | "none" => Ok(EnabledStatistics::None),
1228 "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk),
1229 "PAGE" | "page" => Ok(EnabledStatistics::Page),
1230 _ => Err(format!("Invalid statistics arg: {s}")),
1231 }
1232 }
1233}
1234
impl Default for EnabledStatistics {
    /// Returns [`DEFAULT_STATISTICS_ENABLED`].
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
1240
/// Controls the bloom filter to be computed by the writer.
///
/// The bloom filter is initially sized for `ndv` distinct values at the given `fpp`, then
/// automatically folded down after all values are inserted to achieve optimal size while
/// maintaining the target `fpp`. See [`Sbbf::fold_to_target_fpp`] for details on the
/// folding algorithm.
///
/// [`Sbbf::fold_to_target_fpp`]: crate::bloom_filter::Sbbf::fold_to_target_fpp
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability. This should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp, the
    /// smaller the fpp, the more memory and disk space are required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// This value also serves as the target FPP for bloom filter folding: after all values
    /// are inserted, the filter is folded down to the smallest size that still meets this FPP.
    pub fpp: f64,
    /// Maximum expected number of distinct values. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// When not explicitly set via the builder, this defaults to
    /// [`max_row_group_row_count`](WriterProperties::max_row_group_row_count) (resolved at
    /// build time). The bloom filter is initially sized for this many distinct values at the
    /// given `fpp`, then folded down after insertion to achieve optimal size. A good heuristic
    /// is to set this to the expected number of rows in the row group. If fewer distinct values
    /// are actually written, the filter will be automatically compacted via folding.
    ///
    /// Thus the only negative side of overestimating this value is that the bloom filter
    /// will use more memory during writing than necessary, but it will not affect the final
    /// bloom filter size on disk.
    ///
    /// If you wish to reduce memory usage during writing and are able to make a reasonable estimate
    /// of the number of distinct values in a row group, it is recommended to set this value explicitly
    /// rather than relying on the default dynamic sizing based on `max_row_group_row_count`.
    /// If you do set this value explicitly it is probably best to set it for each column
    /// individually via [`WriterPropertiesBuilder::set_column_bloom_filter_ndv`] rather than globally,
    /// since different columns may have different numbers of distinct values.
    pub ndv: u64,
}
1285
1286impl Default for BloomFilterProperties {
1287 fn default() -> Self {
1288 BloomFilterProperties {
1289 fpp: DEFAULT_BLOOM_FILTER_FPP,
1290 ndv: DEFAULT_BLOOM_FILTER_NDV,
1291 }
1292 }
1293}
1294
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    // Fallback (non-dictionary) encoding; never a dictionary encoding itself
    encoding: Option<Encoding>,
    // Compression codec for this column's pages
    codec: Option<Compression>,
    // Best-effort data page size limit, in bytes
    data_page_size_limit: Option<usize>,
    // Best-effort dictionary page size limit, in bytes
    dictionary_page_size_limit: Option<usize>,
    // Whether dictionary encoding is enabled for this column
    dictionary_enabled: Option<bool>,
    // Statistics level (none / chunk / page) for this column
    statistics_enabled: Option<EnabledStatistics>,
    // Whether min/max statistics are written into data page headers
    write_page_header_statistics: Option<bool>,
    /// bloom filter related properties
    bloom_filter_properties: Option<BloomFilterProperties>,
    /// Whether the bloom filter NDV was explicitly set by the user
    bloom_filter_ndv_is_set: bool,
}
1313
1314impl ColumnProperties {
1315 /// Sets encoding for this column.
1316 ///
1317 /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1318 /// In case when dictionary is enabled for a column, this value is considered to
1319 /// be a fallback encoding.
1320 ///
1321 /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1322 /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1323 /// for a column.
1324 fn set_encoding(&mut self, value: Encoding) {
1325 if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1326 panic!("Dictionary encoding can not be used as fallback encoding");
1327 }
1328 self.encoding = Some(value);
1329 }
1330
1331 /// Sets compression codec for this column.
1332 fn set_compression(&mut self, value: Compression) {
1333 self.codec = Some(value);
1334 }
1335
1336 /// Sets data page size limit for this column.
1337 fn set_data_page_size_limit(&mut self, value: usize) {
1338 self.data_page_size_limit = Some(value);
1339 }
1340
1341 /// Sets whether dictionary encoding is enabled for this column.
1342 fn set_dictionary_enabled(&mut self, enabled: bool) {
1343 self.dictionary_enabled = Some(enabled);
1344 }
1345
1346 /// Sets dictionary page size limit for this column.
1347 fn set_dictionary_page_size_limit(&mut self, value: usize) {
1348 self.dictionary_page_size_limit = Some(value);
1349 }
1350
1351 /// Sets the statistics level for this column.
1352 fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1353 self.statistics_enabled = Some(enabled);
1354 }
1355
1356 /// Sets whether to write statistics in the page header for this column.
1357 fn set_write_page_header_statistics(&mut self, enabled: bool) {
1358 self.write_page_header_statistics = Some(enabled);
1359 }
1360
1361 /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1362 /// otherwise it is a no-op.
1363 /// If `value` is `false`, resets bloom filter properties to `None`.
1364 fn set_bloom_filter_enabled(&mut self, value: bool) {
1365 if value && self.bloom_filter_properties.is_none() {
1366 self.bloom_filter_properties = Some(Default::default())
1367 } else if !value {
1368 self.bloom_filter_properties = None
1369 }
1370 }
1371
1372 /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1373 /// bloom filter if not previously enabled.
1374 ///
1375 /// # Panics
1376 ///
1377 /// Panics if the `value` is not between 0 and 1 exclusive
1378 fn set_bloom_filter_fpp(&mut self, value: f64) {
1379 assert!(
1380 value > 0. && value < 1.0,
1381 "fpp must be between 0 and 1 exclusive, got {value}"
1382 );
1383
1384 self.bloom_filter_properties
1385 .get_or_insert_with(Default::default)
1386 .fpp = value;
1387 }
1388
1389 /// Sets the maximum expected number of distinct (unique) values for bloom filter for this
1390 /// column, and implicitly enables bloom filter if not previously enabled.
1391 fn set_bloom_filter_ndv(&mut self, value: u64) {
1392 self.bloom_filter_properties
1393 .get_or_insert_with(Default::default)
1394 .ndv = value;
1395 self.bloom_filter_ndv_is_set = true;
1396 }
1397
1398 /// Returns optional encoding for this column.
1399 fn encoding(&self) -> Option<Encoding> {
1400 self.encoding
1401 }
1402
1403 /// Returns optional compression codec for this column.
1404 fn compression(&self) -> Option<Compression> {
1405 self.codec
1406 }
1407
1408 /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1409 /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1410 /// been provided.
1411 fn dictionary_enabled(&self) -> Option<bool> {
1412 self.dictionary_enabled
1413 }
1414
1415 /// Returns optional dictionary page size limit for this column.
1416 fn dictionary_page_size_limit(&self) -> Option<usize> {
1417 self.dictionary_page_size_limit
1418 }
1419
1420 /// Returns optional data page size limit for this column.
1421 fn data_page_size_limit(&self) -> Option<usize> {
1422 self.data_page_size_limit
1423 }
1424
1425 /// Returns optional statistics level requested for this column. If result is `None`,
1426 /// then no setting has been provided.
1427 fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1428 self.statistics_enabled
1429 }
1430
1431 /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
1432 /// column.
1433 ///
1434 /// [`Statistics`]: crate::file::statistics::Statistics
1435 fn write_page_header_statistics(&self) -> Option<bool> {
1436 self.write_page_header_statistics
1437 }
1438
1439 /// Returns the bloom filter properties, or `None` if not enabled
1440 fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1441 self.bloom_filter_properties.as_ref()
1442 }
1443
1444 /// If bloom filter is enabled and NDV was not explicitly set, resolve it to the
1445 /// given `default_ndv` (typically derived from `max_row_group_row_count`).
1446 fn resolve_bloom_filter_ndv(&mut self, default_ndv: u64) {
1447 if !self.bloom_filter_ndv_is_set {
1448 if let Some(ref mut bf) = self.bloom_filter_properties {
1449 bf.ndv = default_ndv;
1450 }
1451 }
1452 }
1453}
1454
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

// Defaults applied by `ReaderPropertiesBuilder::build` when the corresponding
// option was not set: bloom filters and page-level statistics are not read.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
const DEFAULT_READ_PAGE_STATS: bool = false;
1460
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    // decompression codec options (e.g. backward compatible LZ4 handling)
    codec_options: CodecOptions,
    // whether bloom filters are read from the file
    read_bloom_filter: bool,
    // whether page-level statistics are decoded
    read_page_stats: bool,
}
1484
impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options (crate-internal).
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether to read bloom filter (crate-internal).
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }

    /// Returns whether to read page level statistics (crate-internal).
    pub(crate) fn read_page_stats(&self) -> bool {
        self.read_page_stats
    }
}
1506
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    // codec configuration under construction
    codec_options_builder: CodecOptionsBuilder,
    // `None` means "use the default" at `build()` time
    read_bloom_filter: Option<bool>,
    // `None` means "use the default" at `build()` time
    read_page_stats: Option<bool>,
}
1514
1515/// Reader properties builder.
1516impl ReaderPropertiesBuilder {
1517 /// Returns default state of the builder.
1518 fn with_defaults() -> Self {
1519 Self {
1520 codec_options_builder: CodecOptionsBuilder::default(),
1521 read_bloom_filter: None,
1522 read_page_stats: None,
1523 }
1524 }
1525
1526 /// Finalizes the configuration and returns immutable reader properties struct.
1527 pub fn build(self) -> ReaderProperties {
1528 ReaderProperties {
1529 codec_options: self.codec_options_builder.build(),
1530 read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1531 read_page_stats: self.read_page_stats.unwrap_or(DEFAULT_READ_PAGE_STATS),
1532 }
1533 }
1534
1535 /// Enable/disable backward compatible LZ4.
1536 ///
1537 /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1538 /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1539 /// with files generated by older versions of this library, and LZ4_RAW, for backward
1540 /// compatibility with files generated by older versions of parquet-cpp.
1541 ///
1542 /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1543 pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1544 self.codec_options_builder = self
1545 .codec_options_builder
1546 .set_backward_compatible_lz4(value);
1547 self
1548 }
1549
1550 /// Enable/disable reading bloom filter
1551 ///
1552 /// If reading bloom filter is enabled, bloom filter will be read from the file.
1553 /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1554 ///
1555 /// By default bloom filter is set to be read.
1556 pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1557 self.read_bloom_filter = Some(value);
1558 self
1559 }
1560
1561 /// Enable/disable reading page-level statistics
1562 ///
1563 /// If set to `true`, then the reader will decode and populate the [`Statistics`] for
1564 /// each page, if present.
1565 /// If set to `false`, then the reader will skip decoding the statistics.
1566 ///
1567 /// By default statistics will not be decoded.
1568 ///
1569 /// [`Statistics`]: crate::file::statistics::Statistics
1570 pub fn set_read_page_statistics(mut self, value: bool) -> Self {
1571 self.read_page_stats = Some(value);
1572 self
1573 }
1574}
1575
#[cfg(test)]
mod tests {
    use super::*;

    // Writer version enum maps to the expected numeric format versions.
    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    // A default-constructed `WriterProperties` exposes every documented default.
    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(
            props.max_row_group_row_count(),
            Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT)
        );
        assert_eq!(props.max_row_group_bytes(), None);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        assert!(
            props
                .bloom_filter_properties(&ColumnPath::from("col"))
                .is_none()
        );
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    // Dictionary encodings are rejected as a column encoding even when
    // dictionary encoding is explicitly enabled...
    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    // ...and also when dictionary encoding is explicitly disabled.
    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    // Builder setters are reflected in the built properties, column-specific
    // settings override the global ones, and a build -> into_builder -> build
    // round trip preserves everything.
    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_row_count(Some(40))
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        fn test_props(props: &WriterProperties) {
            assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
            assert_eq!(props.data_page_size_limit(), 10);
            assert_eq!(props.dictionary_page_size_limit(), 20);
            assert_eq!(props.write_batch_size(), 30);
            assert_eq!(props.max_row_group_row_count(), Some(40));
            assert_eq!(props.created_by(), "default");
            assert_eq!(
                props.key_value_metadata(),
                Some(&vec![
                    KeyValue::new("key".to_string(), "value".to_string(),)
                ])
            );

            // Column "a" has no specific settings, so global values apply.
            assert_eq!(
                props.encoding(&ColumnPath::from("a")),
                Some(Encoding::DELTA_BINARY_PACKED)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("a")),
                Compression::GZIP(Default::default())
            );
            assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("a")),
                EnabledStatistics::None
            );

            // Column "col" has specific settings that override the global ones.
            assert_eq!(
                props.encoding(&ColumnPath::from("col")),
                Some(Encoding::RLE)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("col")),
                Compression::SNAPPY
            );
            assert!(props.dictionary_enabled(&ColumnPath::from("col")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("col")),
                EnabledStatistics::Chunk
            );
            assert_eq!(
                props.bloom_filter_properties(&ColumnPath::from("col")),
                Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
            );
        }

        // Test direct build of properties
        test_props(&props);

        // Test that into_builder() gives the same result
        let props_into_builder_and_back = props.into_builder().build();
        test_props(&props_into_builder_and_back);
    }

    // Settings not touched by the builder keep their documented defaults.
    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: DEFAULT_BLOOM_FILTER_FPP,
                ndv: DEFAULT_BLOOM_FILTER_NDV,
            })
        );
    }

    // The deprecated row-group-size API must stay in sync with the
    // row-count-based replacement.
    #[test]
    #[allow(deprecated)]
    fn test_writer_properties_deprecated_max_row_group_size_still_works() {
        let props = WriterProperties::builder()
            .set_max_row_group_size(42)
            .build();

        assert_eq!(props.max_row_group_row_count(), Some(42));
        assert_eq!(props.max_row_group_size(), 42);
    }

    #[test]
    #[should_panic(expected = "Cannot have a 0 max row group row count")]
    fn test_writer_properties_panic_on_zero_row_group_row_count() {
        let _ = WriterProperties::builder().set_max_row_group_row_count(Some(0));
    }

    #[test]
    #[should_panic(expected = "Cannot have a 0 max row group bytes")]
    fn test_writer_properties_panic_on_zero_row_group_bytes() {
        let _ = WriterProperties::builder().set_max_row_group_bytes(Some(0));
    }

    // Setting either bloom filter knob (ndv or fpp) implicitly enables the
    // bloom filter and fills the other knob with its default.
    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: DEFAULT_BLOOM_FILTER_FPP,
                ndv: 100,
            })
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: DEFAULT_BLOOM_FILTER_NDV,
            })
        );
    }

    // Per-column dictionary page size limit overrides the global one; other
    // columns fall back to the global value.
    #[test]
    fn test_writer_properties_column_dictionary_page_size_limit() {
        let props = WriterProperties::builder()
            .set_dictionary_page_size_limit(100)
            .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
            .build();

        assert_eq!(props.dictionary_page_size_limit(), 100);
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
            10
        );
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
            100
        );
    }

    // Same override/fallback behavior for the data page size limit.
    #[test]
    fn test_writer_properties_column_data_page_size_limit() {
        let props = WriterProperties::builder()
            .set_data_page_size_limit(100)
            .set_column_data_page_size_limit(ColumnPath::from("col"), 10)
            .build();

        assert_eq!(props.data_page_size_limit(), 100);
        assert_eq!(
            props.column_data_page_size_limit(&ColumnPath::from("col")),
            10
        );
        assert_eq!(
            props.column_data_page_size_limit(&ColumnPath::from("other")),
            100
        );
    }

    // Reader defaults: backward-compatible LZ4 on, bloom filter reading off.
    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(!props.read_bloom_filter());
    }

    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
    }

    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);

        // test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }

    #[test]
    fn test_cdc_options_equality() {
        let opts = CdcOptions::default();
        assert_eq!(opts, CdcOptions::default());

        let custom = CdcOptions {
            min_chunk_size: 1024,
            max_chunk_size: 8192,
            norm_level: 1,
        };
        // Compare against an independently constructed value rather than the
        // value itself: `assert_eq!(custom, custom)` is trivially true
        // (clippy::eq_op) and would not catch a broken PartialEq impl.
        let same_as_custom = CdcOptions {
            min_chunk_size: 1024,
            max_chunk_size: 8192,
            norm_level: 1,
        };
        assert_eq!(custom, same_as_custom);
        assert_ne!(opts, custom);
    }
}
1959}