parquet/file/properties.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::{KeyValue, SortingColumn};
24use crate::schema::types::ColumnPath;
25use std::str::FromStr;
26use std::{collections::HashMap, sync::Arc};
27
/// Default value for [`WriterProperties::data_page_size_limit`]
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_row_count`]
pub const DEFAULT_MAX_ROW_GROUP_ROW_COUNT: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`] (false positive probability)
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`].
///
/// Note: this is only the fallback default used when constructing [`BloomFilterProperties`]
/// directly. When using [`WriterPropertiesBuilder`], columns with bloom filters enabled
/// but without an explicit NDV will have their NDV resolved at build time to
/// [`WriterProperties::max_row_group_row_count`], which may differ from this constant
/// if the user configured a custom row group size.
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = DEFAULT_MAX_ROW_GROUP_ROW_COUNT as u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
/// Default minimum chunk size for content-defined chunking: 256 KiB.
pub const DEFAULT_CDC_MIN_CHUNK_SIZE: usize = 256 * 1024;
/// Default maximum chunk size for content-defined chunking: 1024 KiB.
pub const DEFAULT_CDC_MAX_CHUNK_SIZE: usize = 1024 * 1024;
/// Default normalization level for content-defined chunking.
pub const DEFAULT_CDC_NORM_LEVEL: i32 = 0;
76
77/// EXPERIMENTAL: Options for content-defined chunking (CDC).
78///
79/// Content-defined chunking is an experimental feature that optimizes parquet
80/// files for content addressable storage (CAS) systems by writing data pages
81/// according to content-defined chunk boundaries. This allows for more
82/// efficient deduplication of data across files, hence more efficient network
83/// transfers and storage.
84///
85/// Each content-defined chunk is written as a separate parquet data page. The
86/// following options control the chunks' size and the chunking process. Note
87/// that the chunk size is calculated based on the logical value of the data,
88/// before any encoding or compression is applied.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CdcOptions {
    /// Minimum chunk size in bytes, default is 256 KiB ([`DEFAULT_CDC_MIN_CHUNK_SIZE`]).
    /// The rolling hash will not be updated until this size is reached for each chunk.
    /// Note that all data sent through the hash function is counted towards the chunk
    /// size, including definition and repetition levels if present.
    pub min_chunk_size: usize,
    /// Maximum chunk size in bytes, default is 1024 KiB ([`DEFAULT_CDC_MAX_CHUNK_SIZE`]).
    /// The chunker will create a new chunk whenever the chunk size exceeds this value.
    /// Note that the parquet writer has a related [`data_page_size_limit`] property that
    /// controls the maximum size of a parquet data page after encoding. While setting
    /// `data_page_size_limit` to a smaller value than `max_chunk_size` doesn't affect
    /// the chunking effectiveness, it results in more small parquet data pages.
    ///
    /// [`data_page_size_limit`]: WriterPropertiesBuilder::set_data_page_size_limit
    pub max_chunk_size: usize,
    /// Number of bit adjustment to the gearhash mask in order to center the chunk size
    /// around the average size more aggressively, default is 0 ([`DEFAULT_CDC_NORM_LEVEL`]).
    /// Increasing the normalization level increases the probability of finding a chunk,
    /// improving the deduplication ratio, but also increasing the number of small chunks
    /// resulting in many small parquet data pages. The default value provides a good
    /// balance between deduplication ratio and fragmentation.
    /// Use norm_level=1 or norm_level=2 to reach a higher deduplication ratio at the
    /// expense of fragmentation. Negative values can also be used to reduce the
    /// probability of finding a chunk, resulting in larger chunks and fewer data pages.
    /// Note that values outside [-3, 3] are not recommended, prefer using the default
    /// value of 0 for most use cases.
    pub norm_level: i32,
}
118
119impl Default for CdcOptions {
120 fn default() -> Self {
121 Self {
122 min_chunk_size: DEFAULT_CDC_MIN_CHUNK_SIZE,
123 max_chunk_size: DEFAULT_CDC_MAX_CHUNK_SIZE,
124 norm_level: DEFAULT_CDC_NORM_LEVEL,
125 }
126 }
127}
128
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
///
/// Can be parsed from the strings `"PARQUET_1_0"`/`"parquet_1_0"` and
/// `"PARQUET_2_0"`/`"parquet_2_0"` via its [`FromStr`] implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}
140
141impl WriterVersion {
142 /// Returns writer version as `i32`.
143 pub fn as_num(&self) -> i32 {
144 match self {
145 WriterVersion::PARQUET_1_0 => 1,
146 WriterVersion::PARQUET_2_0 => 2,
147 }
148 }
149}
150
151impl FromStr for WriterVersion {
152 type Err = String;
153
154 fn from_str(s: &str) -> Result<Self, Self::Err> {
155 match s {
156 "PARQUET_1_0" | "parquet_1_0" => Ok(WriterVersion::PARQUET_1_0),
157 "PARQUET_2_0" | "parquet_2_0" => Ok(WriterVersion::PARQUET_2_0),
158 _ => Err(format!("Invalid writer version: {s}")),
159 }
160 }
161}
162
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
/// The default is [`BloomFilterPosition::AfterRowGroup`]
/// (see [`DEFAULT_BLOOM_FILTER_POSITION`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
180
/// Reference counted writer properties, cheap to clone and share between
/// multiple writers.
pub type WriterPropertiesPtr = Arc<WriterProperties>;
183
184/// Configuration settings for writing parquet files.
185///
186/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
187///
188/// # Example
189///
190/// ```rust
191/// # use parquet::{
192/// # basic::{Compression, Encoding},
193/// # file::properties::*,
194/// # schema::types::ColumnPath,
195/// # };
196/// #
197/// // Create properties with default configuration.
198/// let props = WriterProperties::default();
199///
200/// // Use properties builder to set certain options and assemble the configuration.
201/// let props = WriterProperties::builder()
202/// .set_writer_version(WriterVersion::PARQUET_1_0)
203/// .set_encoding(Encoding::PLAIN)
204/// .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
205/// .set_compression(Compression::SNAPPY)
206/// .build();
207///
208/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
209/// assert_eq!(
210/// props.encoding(&ColumnPath::from("col1")),
211/// Some(Encoding::DELTA_BINARY_PACKED)
212/// );
213/// assert_eq!(
214/// props.encoding(&ColumnPath::from("col2")),
215/// Some(Encoding::PLAIN)
216/// );
217/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    /// Best-effort maximum number of rows per data page.
    data_page_row_count_limit: usize,
    /// Number of values written per internal batch; limits are checked between batches.
    write_batch_size: usize,
    /// Maximum rows per row group; `None` means unlimited.
    max_row_group_row_count: Option<usize>,
    /// Maximum estimated row group size in bytes; `None` means unlimited.
    max_row_group_bytes: Option<usize>,
    /// Where bloom filters are placed in the output file.
    bloom_filter_position: BloomFilterPosition,
    /// Format version recorded in the file metadata.
    writer_version: WriterVersion,
    /// `created_by` string written to the file metadata.
    created_by: String,
    /// Whether offset index writing is disabled (overridden to `false` when
    /// page statistics are explicitly enabled — see `offset_index_disabled()`).
    offset_index_disabled: bool,
    /// Optional application-defined key/value metadata.
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    /// Settings applied to columns without an explicit per-column override.
    default_column_properties: ColumnProperties,
    /// Per-column overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Declared sort order of rows within row groups, if any.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Max length of min/max values in the column index; `None` disables truncation.
    column_index_truncate_length: Option<usize>,
    /// Max length of min/max values in statistics; `None` disables truncation.
    statistics_truncate_length: Option<usize>,
    /// Whether to coerce types to Parquet-native representations when writing.
    coerce_types: bool,
    /// Content-defined chunking options; `None` disables CDC.
    content_defined_chunking: Option<CdcOptions>,
    /// Optional file encryption configuration.
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
239
240impl Default for WriterProperties {
241 fn default() -> Self {
242 Self::builder().build()
243 }
244}
245
246impl WriterProperties {
247 /// Create a new [`WriterProperties`] with the default settings
248 ///
249 /// See [`WriterProperties::builder`] for customising settings
250 pub fn new() -> Self {
251 Self::default()
252 }
253
254 /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
255 /// properties.
256 pub fn builder() -> WriterPropertiesBuilder {
257 WriterPropertiesBuilder::default()
258 }
259
260 /// Converts this [`WriterProperties`] into a [`WriterPropertiesBuilder`]
261 /// Used for mutating existing property settings
262 pub fn into_builder(self) -> WriterPropertiesBuilder {
263 self.into()
264 }
265
266 /// Returns data page size limit.
267 ///
268 /// Note: this is a best effort limit based on the write batch size
269 ///
270 /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
271 pub fn data_page_size_limit(&self) -> usize {
272 self.default_column_properties
273 .data_page_size_limit()
274 .unwrap_or(DEFAULT_PAGE_SIZE)
275 }
276
277 /// Returns data page size limit for a specific column.
278 ///
279 /// Takes precedence over [`Self::data_page_size_limit`].
280 ///
281 /// Note: this is a best effort limit based on the write batch size.
282 pub fn column_data_page_size_limit(&self, col: &ColumnPath) -> usize {
283 self.column_properties
284 .get(col)
285 .and_then(|c| c.data_page_size_limit())
286 .or_else(|| self.default_column_properties.data_page_size_limit())
287 .unwrap_or(DEFAULT_PAGE_SIZE)
288 }
289
290 /// Returns dictionary page size limit.
291 ///
292 /// Note: this is a best effort limit based on the write batch size
293 ///
294 /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
295 pub fn dictionary_page_size_limit(&self) -> usize {
296 self.default_column_properties
297 .dictionary_page_size_limit()
298 .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
299 }
300
301 /// Returns dictionary page size limit for a specific column.
302 pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
303 self.column_properties
304 .get(col)
305 .and_then(|c| c.dictionary_page_size_limit())
306 .or_else(|| self.default_column_properties.dictionary_page_size_limit())
307 .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
308 }
309
310 /// Returns the maximum page row count
311 ///
312 /// Note: this is a best effort limit based on the write batch size
313 ///
314 /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
315 pub fn data_page_row_count_limit(&self) -> usize {
316 self.data_page_row_count_limit
317 }
318
319 /// Returns configured batch size for writes.
320 ///
321 /// When writing a batch of data, this setting allows to split it internally into
322 /// smaller batches so we can better estimate the size of a page currently being
323 /// written.
324 ///
325 /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
326 pub fn write_batch_size(&self) -> usize {
327 self.write_batch_size
328 }
329
330 /// Returns maximum number of rows in a row group, or `usize::MAX` if unlimited.
331 ///
332 /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
333 #[deprecated(since = "58.0.0", note = "Use `max_row_group_row_count` instead")]
334 pub fn max_row_group_size(&self) -> usize {
335 self.max_row_group_row_count.unwrap_or(usize::MAX)
336 }
337
338 /// Returns maximum number of rows in a row group, or `None` if unlimited.
339 ///
340 /// For more details see [`WriterPropertiesBuilder::set_max_row_group_row_count`]
341 pub fn max_row_group_row_count(&self) -> Option<usize> {
342 self.max_row_group_row_count
343 }
344
345 /// Returns maximum size of a row group in bytes, or `None` if unlimited.
346 ///
347 /// For more details see [`WriterPropertiesBuilder::set_max_row_group_bytes`]
348 pub fn max_row_group_bytes(&self) -> Option<usize> {
349 self.max_row_group_bytes
350 }
351
352 /// Returns bloom filter position.
353 ///
354 /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
355 pub fn bloom_filter_position(&self) -> BloomFilterPosition {
356 self.bloom_filter_position
357 }
358
359 /// Returns configured writer version.
360 ///
361 /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
362 pub fn writer_version(&self) -> WriterVersion {
363 self.writer_version
364 }
365
366 /// Returns `created_by` string.
367 ///
368 /// For more details see [`WriterPropertiesBuilder::set_created_by`]
369 pub fn created_by(&self) -> &str {
370 &self.created_by
371 }
372
373 /// Returns `true` if offset index writing is disabled.
374 ///
375 /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
376 pub fn offset_index_disabled(&self) -> bool {
377 // If page statistics are to be collected, then do not disable the offset indexes.
378 let default_page_stats_enabled =
379 self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
380 let column_page_stats_enabled = self
381 .column_properties
382 .iter()
383 .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
384 if default_page_stats_enabled || column_page_stats_enabled {
385 return false;
386 }
387
388 self.offset_index_disabled
389 }
390
391 /// Returns `key_value_metadata` KeyValue pairs.
392 ///
393 /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
394 pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
395 self.key_value_metadata.as_ref()
396 }
397
398 /// Returns sorting columns.
399 ///
400 /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
401 pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
402 self.sorting_columns.as_ref()
403 }
404
405 /// Returns the maximum length of truncated min/max values in the column index.
406 ///
407 /// `None` if truncation is disabled, must be greater than 0 otherwise.
408 ///
409 /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
410 pub fn column_index_truncate_length(&self) -> Option<usize> {
411 self.column_index_truncate_length
412 }
413
414 /// Returns the maximum length of truncated min/max values in [`Statistics`].
415 ///
416 /// `None` if truncation is disabled, must be greater than 0 otherwise.
417 ///
418 /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
419 ///
420 /// [`Statistics`]: crate::file::statistics::Statistics
421 pub fn statistics_truncate_length(&self) -> Option<usize> {
422 self.statistics_truncate_length
423 }
424
425 /// Returns `true` if type coercion is enabled.
426 ///
427 /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
428 pub fn coerce_types(&self) -> bool {
429 self.coerce_types
430 }
431
432 /// EXPERIMENTAL: Returns content-defined chunking options, or `None` if CDC is disabled.
433 ///
434 /// For more details see [`WriterPropertiesBuilder::set_content_defined_chunking`]
435 pub fn content_defined_chunking(&self) -> Option<&CdcOptions> {
436 self.content_defined_chunking.as_ref()
437 }
438
439 /// Returns encoding for a data page, when dictionary encoding is enabled.
440 ///
441 /// This is not configurable.
442 #[inline]
443 pub fn dictionary_data_page_encoding(&self) -> Encoding {
444 // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
445 // Dictionary values are encoded using RLE_DICTIONARY encoding.
446 Encoding::RLE_DICTIONARY
447 }
448
449 /// Returns encoding for dictionary page, when dictionary encoding is enabled.
450 ///
451 /// This is not configurable.
452 #[inline]
453 pub fn dictionary_page_encoding(&self) -> Encoding {
454 // PLAIN_DICTIONARY is deprecated in writer version 1.
455 // Dictionary is encoded using plain encoding.
456 Encoding::PLAIN
457 }
458
459 /// Returns encoding for a column, if set.
460 ///
461 /// In case when dictionary is enabled, returns fallback encoding.
462 ///
463 /// If encoding is not set, then column writer will choose the best encoding
464 /// based on the column type.
465 pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
466 self.column_properties
467 .get(col)
468 .and_then(|c| c.encoding())
469 .or_else(|| self.default_column_properties.encoding())
470 }
471
472 /// Returns compression codec for a column.
473 ///
474 /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
475 pub fn compression(&self, col: &ColumnPath) -> Compression {
476 self.column_properties
477 .get(col)
478 .and_then(|c| c.compression())
479 .or_else(|| self.default_column_properties.compression())
480 .unwrap_or(DEFAULT_COMPRESSION)
481 }
482
483 /// Returns `true` if dictionary encoding is enabled for a column.
484 ///
485 /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
486 pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
487 self.column_properties
488 .get(col)
489 .and_then(|c| c.dictionary_enabled())
490 .or_else(|| self.default_column_properties.dictionary_enabled())
491 .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
492 }
493
494 /// Returns which statistics are written for a column.
495 ///
496 /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
497 pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
498 self.column_properties
499 .get(col)
500 .and_then(|c| c.statistics_enabled())
501 .or_else(|| self.default_column_properties.statistics_enabled())
502 .unwrap_or(DEFAULT_STATISTICS_ENABLED)
503 }
504
505 /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
506 ///
507 /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
508 ///
509 /// [`Statistics`]: crate::file::statistics::Statistics
510 pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
511 self.column_properties
512 .get(col)
513 .and_then(|c| c.write_page_header_statistics())
514 .or_else(|| {
515 self.default_column_properties
516 .write_page_header_statistics()
517 })
518 .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
519 }
520
521 /// Returns the [`BloomFilterProperties`] for the given column
522 ///
523 /// Returns `None` if bloom filter is disabled
524 ///
525 /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
526 pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
527 self.column_properties
528 .get(col)
529 .and_then(|c| c.bloom_filter_properties())
530 .or_else(|| self.default_column_properties.bloom_filter_properties())
531 }
532
533 /// Return file encryption properties
534 ///
535 /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
536 #[cfg(feature = "encryption")]
537 pub fn file_encryption_properties(&self) -> Option<&Arc<FileEncryptionProperties>> {
538 self.file_encryption_properties.as_ref()
539 }
540}
541
/// Builder for [`WriterProperties`] Parquet writer configuration.
///
/// See example on [`WriterProperties`]
#[derive(Debug, Clone)]
pub struct WriterPropertiesBuilder {
    // Each field mirrors the field of the same name on `WriterProperties`;
    // see the corresponding accessors there for the semantics of each value.
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_row_count: Option<usize>,
    max_row_group_bytes: Option<usize>,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    offset_index_disabled: bool,
    key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
    content_defined_chunking: Option<CdcOptions>,
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
566
567impl Default for WriterPropertiesBuilder {
568 /// Returns default state of the builder.
569 fn default() -> Self {
570 Self {
571 data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
572 write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
573 max_row_group_row_count: Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT),
574 max_row_group_bytes: None,
575 bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
576 writer_version: DEFAULT_WRITER_VERSION,
577 created_by: DEFAULT_CREATED_BY.to_string(),
578 offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
579 key_value_metadata: None,
580 default_column_properties: Default::default(),
581 column_properties: HashMap::new(),
582 sorting_columns: None,
583 column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
584 statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
585 coerce_types: DEFAULT_COERCE_TYPES,
586 content_defined_chunking: None,
587 #[cfg(feature = "encryption")]
588 file_encryption_properties: None,
589 }
590 }
591}
592
593impl WriterPropertiesBuilder {
594 /// Finalizes the configuration and returns immutable writer properties struct.
595 pub fn build(self) -> WriterProperties {
596 // Resolve bloom filter NDV for columns where it wasn't explicitly set:
597 // default to max_row_group_row_count so the filter is never undersized.
598 let default_ndv = self
599 .max_row_group_row_count
600 .unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT) as u64;
601 let mut default_column_properties = self.default_column_properties;
602 default_column_properties.resolve_bloom_filter_ndv(default_ndv);
603 let mut column_properties = self.column_properties;
604 for props in column_properties.values_mut() {
605 props.resolve_bloom_filter_ndv(default_ndv);
606 }
607
608 WriterProperties {
609 data_page_row_count_limit: self.data_page_row_count_limit,
610 write_batch_size: self.write_batch_size,
611 max_row_group_row_count: self.max_row_group_row_count,
612 max_row_group_bytes: self.max_row_group_bytes,
613 bloom_filter_position: self.bloom_filter_position,
614 writer_version: self.writer_version,
615 created_by: self.created_by,
616 offset_index_disabled: self.offset_index_disabled,
617 key_value_metadata: self.key_value_metadata,
618 default_column_properties,
619 column_properties,
620 sorting_columns: self.sorting_columns,
621 column_index_truncate_length: self.column_index_truncate_length,
622 statistics_truncate_length: self.statistics_truncate_length,
623 coerce_types: self.coerce_types,
624 content_defined_chunking: self.content_defined_chunking,
625 #[cfg(feature = "encryption")]
626 file_encryption_properties: self.file_encryption_properties,
627 }
628 }
629
630 // ----------------------------------------------------------------------
631 // Writer properties related to a file
632
633 /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
634 /// via [`DEFAULT_WRITER_VERSION`])
635 ///
636 /// This value can determine what features some readers will support.
637 ///
638 /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
639 pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
640 self.writer_version = value;
641 self
642 }
643
644 /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
645 /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
646 ///
647 /// The parquet writer will attempt to limit the number of rows in
648 /// each `DataPage` to this value. Reducing this value will result
649 /// in larger parquet files, but may improve the effectiveness of
650 /// page index based predicate pushdown during reading.
651 ///
652 /// Note: this is a best effort limit based on value of
653 /// [`set_write_batch_size`](Self::set_write_batch_size).
654 pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
655 self.data_page_row_count_limit = value;
656 self
657 }
658
659 /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
660 ///
661 /// For performance reasons, data for each column is written in
662 /// batches of this size.
663 ///
664 /// Additional limits such as such as
665 /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
666 /// are checked between batches, and thus the write batch size value acts as an
667 /// upper-bound on the enforcement granularity of other limits.
668 pub fn set_write_batch_size(mut self, value: usize) -> Self {
669 self.write_batch_size = value;
670 self
671 }
672
673 /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
674 /// via [`DEFAULT_MAX_ROW_GROUP_ROW_COUNT`]).
675 ///
676 /// # Panics
677 /// If the value is set to 0.
678 #[deprecated(since = "58.0.0", note = "Use `set_max_row_group_row_count` instead")]
679 pub fn set_max_row_group_size(mut self, value: usize) -> Self {
680 assert!(value > 0, "Cannot have a 0 max row group size");
681 self.max_row_group_row_count = Some(value);
682 self
683 }
684
685 /// Sets maximum number of rows in a row group, or `None` for unlimited.
686 ///
687 /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
688 /// the row group with the smaller limit will be produced.
689 ///
690 /// # Panics
691 /// If the value is `Some(0)`.
692 pub fn set_max_row_group_row_count(mut self, value: Option<usize>) -> Self {
693 assert_ne!(value, Some(0), "Cannot have a 0 max row group row count");
694 self.max_row_group_row_count = value;
695 self
696 }
697
698 /// Sets maximum size of a row group in bytes, or `None` for unlimited.
699 ///
700 /// Row groups are flushed when their estimated encoded size exceeds this threshold.
701 /// This is similar to the official Java implementation for `parquet.block.size`'s behavior.
702 ///
703 /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
704 /// the row group with the smaller limit will be produced.
705 ///
706 /// # Panics
707 /// If the value is `Some(0)`.
708 pub fn set_max_row_group_bytes(mut self, value: Option<usize>) -> Self {
709 assert_ne!(value, Some(0), "Cannot have a 0 max row group bytes");
710 self.max_row_group_bytes = value;
711 self
712 }
713
714 /// Sets where in the final file Bloom Filters are written (defaults to [`AfterRowGroup`]
715 /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
716 ///
717 /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
718 pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
719 self.bloom_filter_position = value;
720 self
721 }
722
723 /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
724 /// [`DEFAULT_CREATED_BY`]).
725 ///
726 /// This is a string that will be written into the file metadata
727 pub fn set_created_by(mut self, value: String) -> Self {
728 self.created_by = value;
729 self
730 }
731
732 /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
733 /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
734 ///
735 /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
736 ///
737 /// Note: As the offset indexes are useful for accessing data by row number,
738 /// they are always written by default, regardless of whether other statistics
739 /// are enabled. Disabling this metadata may result in a degradation in read
740 /// performance, so use this option with care.
741 ///
742 /// [`Page`]: EnabledStatistics::Page
743 pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
744 self.offset_index_disabled = value;
745 self
746 }
747
748 /// Sets "key_value_metadata" property (defaults to `None`).
749 pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
750 self.key_value_metadata = value;
751 self
752 }
753
754 /// Sets sorting order of rows in the row group if any (defaults to `None`).
755 pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
756 self.sorting_columns = value;
757 self
758 }
759
760 /// Sets the max length of min/max value fields when writing the column
761 /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
762 ///
763 /// This can be used to prevent columns with very long values (hundreds of
764 /// bytes long) from causing the parquet metadata to become huge.
765 ///
766 /// # Notes
767 ///
768 /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
769 /// set to [`EnabledStatistics::Page`].
770 ///
771 /// * If `Some`, must be greater than 0, otherwise will panic
772 /// * If `None`, there's no effective limit.
773 ///
774 /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
775 pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
776 if let Some(value) = max_length {
777 assert!(
778 value > 0,
779 "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`."
780 );
781 }
782
783 self.column_index_truncate_length = max_length;
784 self
785 }
786
787 /// Sets the max length of min/max value fields in row group and data page header
788 /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
789 ///
790 /// # Notes
791 /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
792 /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
793 /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
794 /// [`EnabledStatistics::Page`].
795 ///
796 /// * If `Some`, must be greater than 0, otherwise will panic
797 /// * If `None`, there's no effective limit.
798 ///
799 /// # See also
800 /// Truncation of Page Index statistics is controlled separately via
801 /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
802 ///
803 /// [`Statistics`]: crate::file::statistics::Statistics
804 pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
805 if let Some(value) = max_length {
806 assert!(
807 value > 0,
808 "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`."
809 );
810 }
811
812 self.statistics_truncate_length = max_length;
813 self
814 }
815
816 /// Should the writer coerce types to parquet native types (defaults to `false` via
817 /// [`DEFAULT_COERCE_TYPES`]).
818 ///
819 /// Leaving this option the default `false` will ensure the exact same data
820 /// written to parquet using this library will be read.
821 ///
822 /// Setting this option to `true` will result in parquet files that can be
823 /// read by more readers, but potentially lose information in the process.
824 ///
825 /// * Types such as [`DataType::Date64`], which have no direct corresponding
826 /// Parquet type, may be stored with lower precision.
827 ///
828 /// * The internal field names of `List` and `Map` types will be renamed if
829 /// necessary to match what is required by the newest Parquet specification.
830 ///
831 /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
832 ///
833 /// [`DataType::Date64`]: arrow_schema::DataType::Date64
834 /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
835 pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
836 self.coerce_types = coerce_types;
837 self
838 }
839
840 /// EXPERIMENTAL: Sets content-defined chunking options, or disables CDC with `None`.
841 ///
842 /// When enabled, data page boundaries are determined by a rolling hash of the
843 /// column values, so unchanged data produces identical byte sequences across
844 /// file versions. This enables efficient deduplication on content-addressable
845 /// storage systems.
846 ///
847 /// Only supported through the Arrow writer interface ([`ArrowWriter`]).
848 ///
849 /// # Panics
850 ///
851 /// Panics if `min_chunk_size == 0` or `max_chunk_size <= min_chunk_size`.
852 ///
853 /// [`ArrowWriter`]: crate::arrow::arrow_writer::ArrowWriter
854 pub fn set_content_defined_chunking(mut self, options: Option<CdcOptions>) -> Self {
855 if let Some(ref options) = options {
856 assert!(
857 options.min_chunk_size > 0,
858 "min_chunk_size must be positive"
859 );
860 assert!(
861 options.max_chunk_size > options.min_chunk_size,
862 "max_chunk_size ({}) must be greater than min_chunk_size ({})",
863 options.max_chunk_size,
864 options.min_chunk_size
865 );
866 }
867 self.content_defined_chunking = options;
868 self
869 }
870
871 /// Sets FileEncryptionProperties (defaults to `None`)
872 #[cfg(feature = "encryption")]
873 pub fn with_file_encryption_properties(
874 mut self,
875 file_encryption_properties: Arc<FileEncryptionProperties>,
876 ) -> Self {
877 self.file_encryption_properties = Some(file_encryption_properties);
878 self
879 }
880
881 // ----------------------------------------------------------------------
882 // Setters for any column (global)
883
884 /// Sets default encoding for all columns.
885 ///
886 /// If dictionary is not enabled, this is treated as a primary encoding for all
887 /// columns. In case when dictionary is enabled for any column, this value is
888 /// considered to be a fallback encoding for that column.
889 ///
890 /// # Panics
891 ///
892 /// if dictionary encoding is specified, regardless of dictionary
893 /// encoding flag being set.
894 pub fn set_encoding(mut self, value: Encoding) -> Self {
895 self.default_column_properties.set_encoding(value);
896 self
897 }
898
899 /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
900 /// [`DEFAULT_COMPRESSION`]).
901 ///
902 /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
903 pub fn set_compression(mut self, value: Compression) -> Self {
904 self.default_column_properties.set_compression(value);
905 self
906 }
907
908 /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
909 /// via [`DEFAULT_DICTIONARY_ENABLED`]).
910 ///
911 /// Use this method to set dictionary encoding, instead of explicitly specifying
912 /// encoding in `set_encoding` method.
913 pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
914 self.default_column_properties.set_dictionary_enabled(value);
915 self
916 }
917
918 /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
919 /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
920 ///
921 /// The parquet writer will attempt to limit the size of each
922 /// `DataPage` used to store dictionaries to this many
923 /// bytes. Reducing this value will result in larger parquet
924 /// files, but may improve the effectiveness of page index based
925 /// predicate pushdown during reading.
926 ///
927 /// Note: this is a best effort limit based on value of
928 /// [`set_write_batch_size`](Self::set_write_batch_size).
929 pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
930 self.default_column_properties
931 .set_dictionary_page_size_limit(value);
932 self
933 }
934
935 /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
936 /// via [`DEFAULT_PAGE_SIZE`]).
937 ///
938 /// The parquet writer will attempt to limit the sizes of each
939 /// `DataPage` to this many bytes. Reducing this value will result
940 /// in larger parquet files, but may improve the effectiveness of
941 /// page index based predicate pushdown during reading.
942 ///
943 /// Note: this is a best effort limit based on value of
944 /// [`set_write_batch_size`](Self::set_write_batch_size).
945 pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
946 self.default_column_properties
947 .set_data_page_size_limit(value);
948 self
949 }
950
951 /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
952 /// [`DEFAULT_STATISTICS_ENABLED`]).
953 ///
954 /// [`Page`]: EnabledStatistics::Page
955 pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
956 self.default_column_properties.set_statistics_enabled(value);
957 self
958 }
959
960 /// enable/disable writing [`Statistics`] in the page header
961 /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
962 ///
963 /// Only applicable if [`Page`] level statistics are gathered.
964 ///
965 /// Setting this value to `true` can greatly increase the size of the resulting Parquet
966 /// file while yielding very little added benefit. Most modern Parquet implementations
967 /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
968 /// those in the page header.
969 ///
970 /// # Note
971 ///
972 /// Prior to version 56.0.0, the `parquet` crate always wrote these
973 /// statistics (the equivalent of setting this option to `true`). This was
974 /// changed in 56.0.0 to follow the recommendation in the Parquet
975 /// specification. See [issue #7580] for more details.
976 ///
977 /// [`Statistics`]: crate::file::statistics::Statistics
978 /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
979 /// [`Page`]: EnabledStatistics::Page
980 /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
981 pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
982 self.default_column_properties
983 .set_write_page_header_statistics(value);
984 self
985 }
986
    /// Sets if bloom filter should be written for all columns (defaults to `false`).
    ///
    /// # Notes
    ///
    /// * If the bloom filter is enabled previously then it is a no-op.
    ///
    /// * If the bloom filter is not enabled, default values for ndv and fpp
    ///   are used. See [`set_bloom_filter_ndv`] and
    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
    ///
    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
        self.default_column_properties
            .set_bloom_filter_enabled(value);
        self
    }
1004
1005 /// Sets the default target bloom filter false positive probability (fpp)
1006 /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
1007 ///
1008 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
1009 /// been called.
1010 ///
1011 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1012 pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
1013 self.default_column_properties.set_bloom_filter_fpp(value);
1014 self
1015 }
1016
1017 /// Sets default maximum expected number of distinct values (ndv) for bloom filter
1018 /// for all columns (defaults to [`DEFAULT_BLOOM_FILTER_NDV`]).
1019 ///
1020 /// The bloom filter is initially sized for this many distinct values at the
1021 /// configured FPP, then folded down after all values are inserted to achieve
1022 /// optimal size. A good heuristic is to set this to the expected number of rows
1023 /// in the row group.
1024 ///
1025 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
1026 /// been called.
1027 ///
1028 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1029 pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
1030 self.default_column_properties.set_bloom_filter_ndv(value);
1031 self
1032 }
1033
1034 // ----------------------------------------------------------------------
1035 // Setters for a specific column
1036
1037 /// Helper method to get existing or new mutable reference of column properties.
1038 #[inline]
1039 fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
1040 self.column_properties.entry(col).or_default()
1041 }
1042
1043 /// Sets encoding for a specific column.
1044 ///
1045 /// Takes precedence over [`Self::set_encoding`].
1046 ///
1047 /// If dictionary is not enabled, this is treated as a primary encoding for this
1048 /// column. In case when dictionary is enabled for this column, either through
1049 /// global defaults or explicitly, this value is considered to be a fallback
1050 /// encoding for this column.
1051 ///
1052 /// # Panics
1053 /// If user tries to set dictionary encoding here, regardless of dictionary
1054 /// encoding flag being set.
1055 pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
1056 self.get_mut_props(col).set_encoding(value);
1057 self
1058 }
1059
1060 /// Sets compression codec for a specific column.
1061 ///
1062 /// Takes precedence over [`Self::set_compression`].
1063 pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
1064 self.get_mut_props(col).set_compression(value);
1065 self
1066 }
1067
1068 /// Sets flag to enable/disable dictionary encoding for a specific column.
1069 ///
1070 /// Takes precedence over [`Self::set_dictionary_enabled`].
1071 pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
1072 self.get_mut_props(col).set_dictionary_enabled(value);
1073 self
1074 }
1075
1076 /// Sets dictionary page size limit for a specific column.
1077 ///
1078 /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
1079 pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
1080 self.get_mut_props(col)
1081 .set_dictionary_page_size_limit(value);
1082 self
1083 }
1084
1085 /// Sets data page size limit for a specific column.
1086 ///
1087 /// Takes precedence over [`Self::set_data_page_size_limit`].
1088 pub fn set_column_data_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
1089 self.get_mut_props(col).set_data_page_size_limit(value);
1090 self
1091 }
1092
1093 /// Sets [`EnabledStatistics`] level for a specific column.
1094 ///
1095 /// Takes precedence over [`Self::set_statistics_enabled`].
1096 pub fn set_column_statistics_enabled(
1097 mut self,
1098 col: ColumnPath,
1099 value: EnabledStatistics,
1100 ) -> Self {
1101 self.get_mut_props(col).set_statistics_enabled(value);
1102 self
1103 }
1104
1105 /// Sets whether to write [`Statistics`] in the page header for a specific column.
1106 ///
1107 /// Takes precedence over [`Self::set_write_page_header_statistics`].
1108 ///
1109 /// [`Statistics`]: crate::file::statistics::Statistics
1110 pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
1111 self.get_mut_props(col)
1112 .set_write_page_header_statistics(value);
1113 self
1114 }
1115
1116 /// Sets whether a bloom filter should be written for a specific column.
1117 ///
1118 /// Takes precedence over [`Self::set_bloom_filter_enabled`].
1119 pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
1120 self.get_mut_props(col).set_bloom_filter_enabled(value);
1121 self
1122 }
1123
1124 /// Sets the false positive probability for bloom filter for a specific column.
1125 ///
1126 /// Takes precedence over [`Self::set_bloom_filter_fpp`].
1127 pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
1128 self.get_mut_props(col).set_bloom_filter_fpp(value);
1129 self
1130 }
1131
1132 /// Sets the number of distinct values for bloom filter for a specific column.
1133 ///
1134 /// Takes precedence over [`Self::set_bloom_filter_ndv`].
1135 pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
1136 self.get_mut_props(col).set_bloom_filter_ndv(value);
1137 self
1138 }
1139}
1140
1141impl From<WriterProperties> for WriterPropertiesBuilder {
1142 fn from(props: WriterProperties) -> Self {
1143 WriterPropertiesBuilder {
1144 data_page_row_count_limit: props.data_page_row_count_limit,
1145 write_batch_size: props.write_batch_size,
1146 max_row_group_row_count: props.max_row_group_row_count,
1147 max_row_group_bytes: props.max_row_group_bytes,
1148 bloom_filter_position: props.bloom_filter_position,
1149 writer_version: props.writer_version,
1150 created_by: props.created_by,
1151 offset_index_disabled: props.offset_index_disabled,
1152 key_value_metadata: props.key_value_metadata,
1153 default_column_properties: props.default_column_properties,
1154 column_properties: props.column_properties,
1155 sorting_columns: props.sorting_columns,
1156 column_index_truncate_length: props.column_index_truncate_length,
1157 statistics_truncate_length: props.statistics_truncate_length,
1158 coerce_types: props.coerce_types,
1159 content_defined_chunking: props.content_defined_chunking,
1160 #[cfg(feature = "encryption")]
1161 file_encryption_properties: props.file_encryption_properties,
1162 }
1163 }
1164}
1165
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
///
/// A level can also be parsed from the strings `"NONE"`/`"none"`,
/// `"CHUNK"`/`"chunk"` and `"PAGE"`/`"page"` via [`FromStr`].
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. In addition, this will enable the writing
    /// of the column index (the offset index is always written regardless of
    /// this setting). See [`ParquetColumnIndex`] for
    /// more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}
1196
1197impl FromStr for EnabledStatistics {
1198 type Err = String;
1199
1200 fn from_str(s: &str) -> Result<Self, Self::Err> {
1201 match s {
1202 "NONE" | "none" => Ok(EnabledStatistics::None),
1203 "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk),
1204 "PAGE" | "page" => Ok(EnabledStatistics::Page),
1205 _ => Err(format!("Invalid statistics arg: {s}")),
1206 }
1207 }
1208}
1209
impl Default for EnabledStatistics {
    /// Returns [`DEFAULT_STATISTICS_ENABLED`].
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
1215
/// Controls the bloom filter to be computed by the writer.
///
/// The bloom filter is initially sized for `ndv` distinct values at the given `fpp`, then
/// automatically folded down after all values are inserted to achieve optimal size while
/// maintaining the target `fpp`. See [`Sbbf::fold_to_target_fpp`] for details on the
/// folding algorithm.
///
/// [`Sbbf::fold_to_target_fpp`]: crate::bloom_filter::Sbbf::fold_to_target_fpp
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability. This should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp, the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// This value also serves as the target FPP for bloom filter folding: after all values
    /// are inserted, the filter is folded down to the smallest size that still meets this FPP.
    pub fpp: f64,
    /// Maximum expected number of distinct values. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// When not explicitly set via the builder, this defaults to
    /// [`max_row_group_row_count`](WriterProperties::max_row_group_row_count) (resolved at
    /// build time). The bloom filter is initially sized for this many distinct values at the
    /// given `fpp`, then folded down after insertion to achieve optimal size. A good heuristic
    /// is to set this to the expected number of rows in the row group. If fewer distinct values
    /// are actually written, the filter will be automatically compacted via folding.
    ///
    /// Thus the only negative side of overestimating this value is that the bloom filter
    /// will use more memory during writing than necessary, but it will not affect the final
    /// bloom filter size on disk.
    ///
    /// If you wish to reduce memory usage during writing and are able to make a reasonable estimate
    /// of the number of distinct values in a row group, it is recommended to set this value explicitly
    /// rather than relying on the default dynamic sizing based on `max_row_group_row_count`.
    /// If you do set this value explicitly it is probably best to set it for each column
    /// individually via [`WriterPropertiesBuilder::set_column_bloom_filter_ndv`] rather than globally,
    /// since different columns may have different numbers of distinct values.
    pub ndv: u64,
}
1260
1261impl Default for BloomFilterProperties {
1262 fn default() -> Self {
1263 BloomFilterProperties {
1264 fpp: DEFAULT_BLOOM_FILTER_FPP,
1265 ndv: DEFAULT_BLOOM_FILTER_NDV,
1266 }
1267 }
1268}
1269
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    /// Primary (or, with dictionary enabled, fallback) encoding, if configured
    encoding: Option<Encoding>,
    /// Compression codec, if configured
    codec: Option<Compression>,
    /// Best-effort data page size limit in bytes, if configured
    data_page_size_limit: Option<usize>,
    /// Best-effort dictionary page size limit in bytes, if configured
    dictionary_page_size_limit: Option<usize>,
    /// Whether dictionary encoding was explicitly enabled/disabled
    dictionary_enabled: Option<bool>,
    /// Statistics level, if configured
    statistics_enabled: Option<EnabledStatistics>,
    /// Whether to write statistics into data page headers, if configured
    write_page_header_statistics: Option<bool>,
    /// bloom filter related properties
    bloom_filter_properties: Option<BloomFilterProperties>,
    /// Whether the bloom filter NDV was explicitly set by the user
    bloom_filter_ndv_is_set: bool,
}
1288
1289impl ColumnProperties {
1290 /// Sets encoding for this column.
1291 ///
1292 /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1293 /// In case when dictionary is enabled for a column, this value is considered to
1294 /// be a fallback encoding.
1295 ///
1296 /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1297 /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1298 /// for a column.
1299 fn set_encoding(&mut self, value: Encoding) {
1300 if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1301 panic!("Dictionary encoding can not be used as fallback encoding");
1302 }
1303 self.encoding = Some(value);
1304 }
1305
1306 /// Sets compression codec for this column.
1307 fn set_compression(&mut self, value: Compression) {
1308 self.codec = Some(value);
1309 }
1310
1311 /// Sets data page size limit for this column.
1312 fn set_data_page_size_limit(&mut self, value: usize) {
1313 self.data_page_size_limit = Some(value);
1314 }
1315
1316 /// Sets whether dictionary encoding is enabled for this column.
1317 fn set_dictionary_enabled(&mut self, enabled: bool) {
1318 self.dictionary_enabled = Some(enabled);
1319 }
1320
1321 /// Sets dictionary page size limit for this column.
1322 fn set_dictionary_page_size_limit(&mut self, value: usize) {
1323 self.dictionary_page_size_limit = Some(value);
1324 }
1325
1326 /// Sets the statistics level for this column.
1327 fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1328 self.statistics_enabled = Some(enabled);
1329 }
1330
1331 /// Sets whether to write statistics in the page header for this column.
1332 fn set_write_page_header_statistics(&mut self, enabled: bool) {
1333 self.write_page_header_statistics = Some(enabled);
1334 }
1335
1336 /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1337 /// otherwise it is a no-op.
1338 /// If `value` is `false`, resets bloom filter properties to `None`.
1339 fn set_bloom_filter_enabled(&mut self, value: bool) {
1340 if value && self.bloom_filter_properties.is_none() {
1341 self.bloom_filter_properties = Some(Default::default())
1342 } else if !value {
1343 self.bloom_filter_properties = None
1344 }
1345 }
1346
1347 /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1348 /// bloom filter if not previously enabled.
1349 ///
1350 /// # Panics
1351 ///
1352 /// Panics if the `value` is not between 0 and 1 exclusive
1353 fn set_bloom_filter_fpp(&mut self, value: f64) {
1354 assert!(
1355 value > 0. && value < 1.0,
1356 "fpp must be between 0 and 1 exclusive, got {value}"
1357 );
1358
1359 self.bloom_filter_properties
1360 .get_or_insert_with(Default::default)
1361 .fpp = value;
1362 }
1363
1364 /// Sets the maximum expected number of distinct (unique) values for bloom filter for this
1365 /// column, and implicitly enables bloom filter if not previously enabled.
1366 fn set_bloom_filter_ndv(&mut self, value: u64) {
1367 self.bloom_filter_properties
1368 .get_or_insert_with(Default::default)
1369 .ndv = value;
1370 self.bloom_filter_ndv_is_set = true;
1371 }
1372
1373 /// Returns optional encoding for this column.
1374 fn encoding(&self) -> Option<Encoding> {
1375 self.encoding
1376 }
1377
1378 /// Returns optional compression codec for this column.
1379 fn compression(&self) -> Option<Compression> {
1380 self.codec
1381 }
1382
1383 /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1384 /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1385 /// been provided.
1386 fn dictionary_enabled(&self) -> Option<bool> {
1387 self.dictionary_enabled
1388 }
1389
1390 /// Returns optional dictionary page size limit for this column.
1391 fn dictionary_page_size_limit(&self) -> Option<usize> {
1392 self.dictionary_page_size_limit
1393 }
1394
1395 /// Returns optional data page size limit for this column.
1396 fn data_page_size_limit(&self) -> Option<usize> {
1397 self.data_page_size_limit
1398 }
1399
1400 /// Returns optional statistics level requested for this column. If result is `None`,
1401 /// then no setting has been provided.
1402 fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1403 self.statistics_enabled
1404 }
1405
1406 /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
1407 /// column.
1408 ///
1409 /// [`Statistics`]: crate::file::statistics::Statistics
1410 fn write_page_header_statistics(&self) -> Option<bool> {
1411 self.write_page_header_statistics
1412 }
1413
1414 /// Returns the bloom filter properties, or `None` if not enabled
1415 fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1416 self.bloom_filter_properties.as_ref()
1417 }
1418
1419 /// If bloom filter is enabled and NDV was not explicitly set, resolve it to the
1420 /// given `default_ndv` (typically derived from `max_row_group_row_count`).
1421 fn resolve_bloom_filter_ndv(&mut self, default_ndv: u64) {
1422 if !self.bloom_filter_ndv_is_set {
1423 if let Some(ref mut bf) = self.bloom_filter_properties {
1424 bf.ndv = default_ndv;
1425 }
1426 }
1427 }
1428}
1429
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

// Defaults for `ReaderProperties`: bloom filters are not read and page-level
// statistics are not decoded unless explicitly enabled via the builder.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
const DEFAULT_READ_PAGE_STATS: bool = false;
1435
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    // Codec-level options (e.g. backward-compatible LZ4 handling)
    codec_options: CodecOptions,
    // Whether bloom filters are read from the file (defaults to false)
    read_bloom_filter: bool,
    // Whether page-level statistics are decoded (defaults to false)
    read_page_stats: bool,
}
1459
1460impl ReaderProperties {
1461 /// Returns builder for reader properties with default values.
1462 pub fn builder() -> ReaderPropertiesBuilder {
1463 ReaderPropertiesBuilder::with_defaults()
1464 }
1465
1466 /// Returns codec options.
1467 pub(crate) fn codec_options(&self) -> &CodecOptions {
1468 &self.codec_options
1469 }
1470
1471 /// Returns whether to read bloom filter
1472 pub(crate) fn read_bloom_filter(&self) -> bool {
1473 self.read_bloom_filter
1474 }
1475
1476 /// Returns whether to read page level statistics
1477 pub(crate) fn read_page_stats(&self) -> bool {
1478 self.read_page_stats
1479 }
1480}
1481
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    // Accumulates codec options; finalized by `build`
    codec_options_builder: CodecOptionsBuilder,
    // `None` means "use DEFAULT_READ_BLOOM_FILTER at build time"
    read_bloom_filter: Option<bool>,
    // `None` means "use DEFAULT_READ_PAGE_STATS at build time"
    read_page_stats: Option<bool>,
}
1489
/// Reader properties builder.
impl ReaderPropertiesBuilder {
    /// Returns default state of the builder.
    fn with_defaults() -> Self {
        Self {
            codec_options_builder: CodecOptionsBuilder::default(),
            read_bloom_filter: None,
            read_page_stats: None,
        }
    }

    /// Finalizes the configuration and returns immutable reader properties struct.
    ///
    /// Unset options fall back to `DEFAULT_READ_BLOOM_FILTER` / `DEFAULT_READ_PAGE_STATS`
    /// (both `false`).
    pub fn build(self) -> ReaderProperties {
        ReaderProperties {
            codec_options: self.codec_options_builder.build(),
            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
            read_page_stats: self.read_page_stats.unwrap_or(DEFAULT_READ_PAGE_STATS),
        }
    }

    /// Enable/disable backward compatible LZ4.
    ///
    /// If backward compatible LZ4 is enabled, on LZ4_HADOOP error it will fallback
    /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
    /// with files generated by older versions of this library, and LZ4_RAW, for backward
    /// compatibility with files generated by older versions of parquet-cpp.
    ///
    /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
        self.codec_options_builder = self
            .codec_options_builder
            .set_backward_compatible_lz4(value);
        self
    }

    /// Enable/disable reading bloom filter
    ///
    /// If reading bloom filter is enabled, bloom filter will be read from the file.
    /// If reading bloom filter is disabled, bloom filter will not be read from the file.
    ///
    /// By default the bloom filter is NOT read
    /// (`DEFAULT_READ_BLOOM_FILTER` is `false`).
    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
        self.read_bloom_filter = Some(value);
        self
    }

    /// Enable/disable reading page-level statistics
    ///
    /// If set to `true`, then the reader will decode and populate the [`Statistics`] for
    /// each page, if present.
    /// If set to `false`, then the reader will skip decoding the statistics.
    ///
    /// By default statistics will not be decoded.
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn set_read_page_statistics(mut self, value: bool) -> Self {
        self.read_page_stats = Some(value);
        self
    }
}
1550
1551#[cfg(test)]
1552mod tests {
1553 use super::*;
1554
    #[test]
    fn test_writer_version() {
        // `as_num` maps each writer version to its numeric format version.
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }
1560
    #[test]
    fn test_writer_properties_default_settings() {
        // Every getter of a default-constructed `WriterProperties` must agree
        // with the documented DEFAULT_* constants.
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(
            props.max_row_group_row_count(),
            Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT)
        );
        assert_eq!(props.max_row_group_bytes(), None);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        // Per-column getters with no explicit configuration fall back to the
        // global defaults.
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        // Bloom filters are off by default.
        assert!(
            props
                .bloom_filter_properties(&ColumnPath::from("col"))
                .is_none()
        );
    }
1598
1599 #[test]
1600 fn test_writer_properties_dictionary_encoding() {
1601 // dictionary encoding is not configurable, and it should be the same for both
1602 // writer version 1 and 2.
1603 for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
1604 let props = WriterProperties::builder()
1605 .set_writer_version(*version)
1606 .build();
1607 assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
1608 assert_eq!(
1609 props.dictionary_data_page_encoding(),
1610 Encoding::RLE_DICTIONARY
1611 );
1612 }
1613 }
1614
1615 #[test]
1616 #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1617 fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
1618 // Should panic when user specifies dictionary encoding as fallback encoding.
1619 WriterProperties::builder()
1620 .set_encoding(Encoding::PLAIN_DICTIONARY)
1621 .build();
1622 }
1623
1624 #[test]
1625 #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1626 fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
1627 // Should panic when user specifies dictionary encoding as fallback encoding.
1628 WriterProperties::builder()
1629 .set_encoding(Encoding::RLE_DICTIONARY)
1630 .build();
1631 }
1632
1633 #[test]
1634 #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1635 fn test_writer_properties_panic_when_dictionary_is_enabled() {
1636 WriterProperties::builder()
1637 .set_dictionary_enabled(true)
1638 .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
1639 .build();
1640 }
1641
1642 #[test]
1643 #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1644 fn test_writer_properties_panic_when_dictionary_is_disabled() {
1645 WriterProperties::builder()
1646 .set_dictionary_enabled(false)
1647 .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
1648 .build();
1649 }
1650
1651 #[test]
1652 fn test_writer_properties_builder() {
1653 let props = WriterProperties::builder()
1654 // file settings
1655 .set_writer_version(WriterVersion::PARQUET_2_0)
1656 .set_data_page_size_limit(10)
1657 .set_dictionary_page_size_limit(20)
1658 .set_write_batch_size(30)
1659 .set_max_row_group_row_count(Some(40))
1660 .set_created_by("default".to_owned())
1661 .set_key_value_metadata(Some(vec![KeyValue::new(
1662 "key".to_string(),
1663 "value".to_string(),
1664 )]))
1665 // global column settings
1666 .set_encoding(Encoding::DELTA_BINARY_PACKED)
1667 .set_compression(Compression::GZIP(Default::default()))
1668 .set_dictionary_enabled(false)
1669 .set_statistics_enabled(EnabledStatistics::None)
1670 // specific column settings
1671 .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
1672 .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
1673 .set_column_dictionary_enabled(ColumnPath::from("col"), true)
1674 .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
1675 .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
1676 .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
1677 .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
1678 .build();
1679
1680 fn test_props(props: &WriterProperties) {
1681 assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
1682 assert_eq!(props.data_page_size_limit(), 10);
1683 assert_eq!(props.dictionary_page_size_limit(), 20);
1684 assert_eq!(props.write_batch_size(), 30);
1685 assert_eq!(props.max_row_group_row_count(), Some(40));
1686 assert_eq!(props.created_by(), "default");
1687 assert_eq!(
1688 props.key_value_metadata(),
1689 Some(&vec![
1690 KeyValue::new("key".to_string(), "value".to_string(),)
1691 ])
1692 );
1693
1694 assert_eq!(
1695 props.encoding(&ColumnPath::from("a")),
1696 Some(Encoding::DELTA_BINARY_PACKED)
1697 );
1698 assert_eq!(
1699 props.compression(&ColumnPath::from("a")),
1700 Compression::GZIP(Default::default())
1701 );
1702 assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
1703 assert_eq!(
1704 props.statistics_enabled(&ColumnPath::from("a")),
1705 EnabledStatistics::None
1706 );
1707
1708 assert_eq!(
1709 props.encoding(&ColumnPath::from("col")),
1710 Some(Encoding::RLE)
1711 );
1712 assert_eq!(
1713 props.compression(&ColumnPath::from("col")),
1714 Compression::SNAPPY
1715 );
1716 assert!(props.dictionary_enabled(&ColumnPath::from("col")));
1717 assert_eq!(
1718 props.statistics_enabled(&ColumnPath::from("col")),
1719 EnabledStatistics::Chunk
1720 );
1721 assert_eq!(
1722 props.bloom_filter_properties(&ColumnPath::from("col")),
1723 Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
1724 );
1725 }
1726
1727 // Test direct build of properties
1728 test_props(&props);
1729
1730 // Test that into_builder() gives the same result
1731 let props_into_builder_and_back = props.into_builder().build();
1732 test_props(&props_into_builder_and_back);
1733 }
1734
1735 #[test]
1736 fn test_writer_properties_builder_partial_defaults() {
1737 let props = WriterProperties::builder()
1738 .set_encoding(Encoding::DELTA_BINARY_PACKED)
1739 .set_compression(Compression::GZIP(Default::default()))
1740 .set_bloom_filter_enabled(true)
1741 .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
1742 .build();
1743
1744 assert_eq!(
1745 props.encoding(&ColumnPath::from("col")),
1746 Some(Encoding::RLE)
1747 );
1748 assert_eq!(
1749 props.compression(&ColumnPath::from("col")),
1750 Compression::GZIP(Default::default())
1751 );
1752 assert_eq!(
1753 props.dictionary_enabled(&ColumnPath::from("col")),
1754 DEFAULT_DICTIONARY_ENABLED
1755 );
1756 assert_eq!(
1757 props.bloom_filter_properties(&ColumnPath::from("col")),
1758 Some(&BloomFilterProperties {
1759 fpp: DEFAULT_BLOOM_FILTER_FPP,
1760 ndv: DEFAULT_BLOOM_FILTER_NDV,
1761 })
1762 );
1763 }
1764
1765 #[test]
1766 #[allow(deprecated)]
1767 fn test_writer_properties_deprecated_max_row_group_size_still_works() {
1768 let props = WriterProperties::builder()
1769 .set_max_row_group_size(42)
1770 .build();
1771
1772 assert_eq!(props.max_row_group_row_count(), Some(42));
1773 assert_eq!(props.max_row_group_size(), 42);
1774 }
1775
1776 #[test]
1777 #[should_panic(expected = "Cannot have a 0 max row group row count")]
1778 fn test_writer_properties_panic_on_zero_row_group_row_count() {
1779 let _ = WriterProperties::builder().set_max_row_group_row_count(Some(0));
1780 }
1781
1782 #[test]
1783 #[should_panic(expected = "Cannot have a 0 max row group bytes")]
1784 fn test_writer_properties_panic_on_zero_row_group_bytes() {
1785 let _ = WriterProperties::builder().set_max_row_group_bytes(Some(0));
1786 }
1787
1788 #[test]
1789 fn test_writer_properties_bloom_filter_ndv_fpp_set() {
1790 assert_eq!(
1791 WriterProperties::builder()
1792 .build()
1793 .bloom_filter_properties(&ColumnPath::from("col")),
1794 None
1795 );
1796 assert_eq!(
1797 WriterProperties::builder()
1798 .set_bloom_filter_ndv(100)
1799 .build()
1800 .bloom_filter_properties(&ColumnPath::from("col")),
1801 Some(&BloomFilterProperties {
1802 fpp: DEFAULT_BLOOM_FILTER_FPP,
1803 ndv: 100,
1804 })
1805 );
1806 assert_eq!(
1807 WriterProperties::builder()
1808 .set_bloom_filter_fpp(0.1)
1809 .build()
1810 .bloom_filter_properties(&ColumnPath::from("col")),
1811 Some(&BloomFilterProperties {
1812 fpp: 0.1,
1813 ndv: DEFAULT_BLOOM_FILTER_NDV,
1814 })
1815 );
1816 }
1817
1818 #[test]
1819 fn test_writer_properties_column_dictionary_page_size_limit() {
1820 let props = WriterProperties::builder()
1821 .set_dictionary_page_size_limit(100)
1822 .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
1823 .build();
1824
1825 assert_eq!(props.dictionary_page_size_limit(), 100);
1826 assert_eq!(
1827 props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
1828 10
1829 );
1830 assert_eq!(
1831 props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
1832 100
1833 );
1834 }
1835
1836 #[test]
1837 fn test_writer_properties_column_data_page_size_limit() {
1838 let props = WriterProperties::builder()
1839 .set_data_page_size_limit(100)
1840 .set_column_data_page_size_limit(ColumnPath::from("col"), 10)
1841 .build();
1842
1843 assert_eq!(props.data_page_size_limit(), 100);
1844 assert_eq!(
1845 props.column_data_page_size_limit(&ColumnPath::from("col")),
1846 10
1847 );
1848 assert_eq!(
1849 props.column_data_page_size_limit(&ColumnPath::from("other")),
1850 100
1851 );
1852 }
1853
1854 #[test]
1855 fn test_reader_properties_default_settings() {
1856 let props = ReaderProperties::builder().build();
1857
1858 let codec_options = CodecOptionsBuilder::default()
1859 .set_backward_compatible_lz4(true)
1860 .build();
1861
1862 assert_eq!(props.codec_options(), &codec_options);
1863 assert!(!props.read_bloom_filter());
1864 }
1865
1866 #[test]
1867 fn test_reader_properties_builder() {
1868 let props = ReaderProperties::builder()
1869 .set_backward_compatible_lz4(false)
1870 .build();
1871
1872 let codec_options = CodecOptionsBuilder::default()
1873 .set_backward_compatible_lz4(false)
1874 .build();
1875
1876 assert_eq!(props.codec_options(), &codec_options);
1877 }
1878
1879 #[test]
1880 fn test_parse_writerversion() {
1881 let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
1882 assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1883 writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
1884 assert_eq!(writer_version, WriterVersion::PARQUET_2_0);
1885
1886 // test lowercase
1887 writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
1888 assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1889
1890 // test invalid version
1891 match "PARQUET_-1_0".parse::<WriterVersion>() {
1892 Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
1893 Err(e) => {
1894 assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
1895 }
1896 }
1897 }
1898
1899 #[test]
1900 fn test_parse_enabledstatistics() {
1901 let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
1902 assert_eq!(enabled_statistics, EnabledStatistics::None);
1903 enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
1904 assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
1905 enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
1906 assert_eq!(enabled_statistics, EnabledStatistics::Page);
1907
1908 // test lowercase
1909 enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
1910 assert_eq!(enabled_statistics, EnabledStatistics::None);
1911
1912 //test invalid statistics
1913 match "ChunkAndPage".parse::<EnabledStatistics>() {
1914 Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
1915 Err(e) => {
1916 assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
1917 }
1918 }
1919 }
1920
1921 #[test]
1922 fn test_cdc_options_equality() {
1923 let opts = CdcOptions::default();
1924 assert_eq!(opts, CdcOptions::default());
1925
1926 let custom = CdcOptions {
1927 min_chunk_size: 1024,
1928 max_chunk_size: 8192,
1929 norm_level: 1,
1930 };
1931 assert_eq!(custom, custom);
1932 assert_ne!(opts, custom);
1933 }
1934}