// parquet/file/properties.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::{KeyValue, SortingColumn};
24use crate::schema::types::ColumnPath;
25use std::str::FromStr;
26use std::{collections::HashMap, sync::Arc};
27
/// Default value for [`WriterProperties::data_page_size_limit`] (1 MiB)
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`] (same as [`DEFAULT_PAGE_SIZE`])
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_row_count`]
pub const DEFAULT_MAX_ROW_GROUP_ROW_COUNT: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`] (false positive probability)
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`] (number of distinct values)
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
64
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
///
/// Configured via [`WriterPropertiesBuilder::set_writer_version`] and read
/// back via [`WriterProperties::writer_version`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}
76
77impl WriterVersion {
78 /// Returns writer version as `i32`.
79 pub fn as_num(&self) -> i32 {
80 match self {
81 WriterVersion::PARQUET_1_0 => 1,
82 WriterVersion::PARQUET_2_0 => 2,
83 }
84 }
85}
86
87impl FromStr for WriterVersion {
88 type Err = String;
89
90 fn from_str(s: &str) -> Result<Self, Self::Err> {
91 match s {
92 "PARQUET_1_0" | "parquet_1_0" => Ok(WriterVersion::PARQUET_1_0),
93 "PARQUET_2_0" | "parquet_2_0" => Ok(WriterVersion::PARQUET_2_0),
94 _ => Err(format!("Invalid writer version: {s}")),
95 }
96 }
97}
98
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The default is [`BloomFilterPosition::AfterRowGroup`] via
/// [`DEFAULT_BLOOM_FILTER_POSITION`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
116
/// Reference counted writer properties.
///
/// Cloning the [`Arc`] is cheap; the underlying [`WriterProperties`] is shared.
pub type WriterPropertiesPtr = Arc<WriterProperties>;
119
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #     basic::{Compression, Encoding},
/// #     file::properties::*,
/// #     schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    /// Best-effort maximum number of rows per data page.
    data_page_row_count_limit: usize,
    /// Number of rows written per internal batch; limits are checked between batches.
    write_batch_size: usize,
    /// Maximum number of rows per row group; `None` means unlimited.
    max_row_group_row_count: Option<usize>,
    /// Maximum estimated encoded size of a row group in bytes; `None` means unlimited.
    max_row_group_bytes: Option<usize>,
    /// Where Bloom filters are written in the file.
    bloom_filter_position: BloomFilterPosition,
    /// Format version recorded in the file metadata.
    writer_version: WriterVersion,
    /// "created by" string recorded in the file metadata.
    created_by: String,
    /// Whether offset index writing is disabled (overridden when page stats are enabled).
    offset_index_disabled: bool,
    /// Optional application-defined key/value metadata.
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    /// Fallback column settings used when no per-column override exists.
    default_column_properties: ColumnProperties,
    /// Per-column overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Declared sort order of rows in each row group, if any.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Max length of min/max values in the column index; `None` disables truncation.
    column_index_truncate_length: Option<usize>,
    /// Max length of min/max values in statistics; `None` disables truncation.
    statistics_truncate_length: Option<usize>,
    /// Whether to coerce arrow types to parquet-native representations.
    coerce_types: bool,
    /// Optional file encryption configuration.
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
174
175impl Default for WriterProperties {
176 fn default() -> Self {
177 Self::builder().build()
178 }
179}
180
181impl WriterProperties {
182 /// Create a new [`WriterProperties`] with the default settings
183 ///
184 /// See [`WriterProperties::builder`] for customising settings
185 pub fn new() -> Self {
186 Self::default()
187 }
188
189 /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
190 /// properties.
191 pub fn builder() -> WriterPropertiesBuilder {
192 WriterPropertiesBuilder::default()
193 }
194
195 /// Converts this [`WriterProperties`] into a [`WriterPropertiesBuilder`]
196 /// Used for mutating existing property settings
197 pub fn into_builder(self) -> WriterPropertiesBuilder {
198 self.into()
199 }
200
201 /// Returns data page size limit.
202 ///
203 /// Note: this is a best effort limit based on the write batch size
204 ///
205 /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
206 pub fn data_page_size_limit(&self) -> usize {
207 self.default_column_properties
208 .data_page_size_limit()
209 .unwrap_or(DEFAULT_PAGE_SIZE)
210 }
211
212 /// Returns data page size limit for a specific column.
213 ///
214 /// Takes precedence over [`Self::data_page_size_limit`].
215 ///
216 /// Note: this is a best effort limit based on the write batch size.
217 pub fn column_data_page_size_limit(&self, col: &ColumnPath) -> usize {
218 self.column_properties
219 .get(col)
220 .and_then(|c| c.data_page_size_limit())
221 .or_else(|| self.default_column_properties.data_page_size_limit())
222 .unwrap_or(DEFAULT_PAGE_SIZE)
223 }
224
225 /// Returns dictionary page size limit.
226 ///
227 /// Note: this is a best effort limit based on the write batch size
228 ///
229 /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
230 pub fn dictionary_page_size_limit(&self) -> usize {
231 self.default_column_properties
232 .dictionary_page_size_limit()
233 .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
234 }
235
236 /// Returns dictionary page size limit for a specific column.
237 pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
238 self.column_properties
239 .get(col)
240 .and_then(|c| c.dictionary_page_size_limit())
241 .or_else(|| self.default_column_properties.dictionary_page_size_limit())
242 .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
243 }
244
245 /// Returns the maximum page row count
246 ///
247 /// Note: this is a best effort limit based on the write batch size
248 ///
249 /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
250 pub fn data_page_row_count_limit(&self) -> usize {
251 self.data_page_row_count_limit
252 }
253
254 /// Returns configured batch size for writes.
255 ///
256 /// When writing a batch of data, this setting allows to split it internally into
257 /// smaller batches so we can better estimate the size of a page currently being
258 /// written.
259 ///
260 /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
261 pub fn write_batch_size(&self) -> usize {
262 self.write_batch_size
263 }
264
265 /// Returns maximum number of rows in a row group, or `usize::MAX` if unlimited.
266 ///
267 /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
268 #[deprecated(since = "58.0.0", note = "Use `max_row_group_row_count` instead")]
269 pub fn max_row_group_size(&self) -> usize {
270 self.max_row_group_row_count.unwrap_or(usize::MAX)
271 }
272
273 /// Returns maximum number of rows in a row group, or `None` if unlimited.
274 ///
275 /// For more details see [`WriterPropertiesBuilder::set_max_row_group_row_count`]
276 pub fn max_row_group_row_count(&self) -> Option<usize> {
277 self.max_row_group_row_count
278 }
279
280 /// Returns maximum size of a row group in bytes, or `None` if unlimited.
281 ///
282 /// For more details see [`WriterPropertiesBuilder::set_max_row_group_bytes`]
283 pub fn max_row_group_bytes(&self) -> Option<usize> {
284 self.max_row_group_bytes
285 }
286
287 /// Returns bloom filter position.
288 ///
289 /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
290 pub fn bloom_filter_position(&self) -> BloomFilterPosition {
291 self.bloom_filter_position
292 }
293
294 /// Returns configured writer version.
295 ///
296 /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
297 pub fn writer_version(&self) -> WriterVersion {
298 self.writer_version
299 }
300
301 /// Returns `created_by` string.
302 ///
303 /// For more details see [`WriterPropertiesBuilder::set_created_by`]
304 pub fn created_by(&self) -> &str {
305 &self.created_by
306 }
307
308 /// Returns `true` if offset index writing is disabled.
309 ///
310 /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
311 pub fn offset_index_disabled(&self) -> bool {
312 // If page statistics are to be collected, then do not disable the offset indexes.
313 let default_page_stats_enabled =
314 self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
315 let column_page_stats_enabled = self
316 .column_properties
317 .iter()
318 .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
319 if default_page_stats_enabled || column_page_stats_enabled {
320 return false;
321 }
322
323 self.offset_index_disabled
324 }
325
326 /// Returns `key_value_metadata` KeyValue pairs.
327 ///
328 /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
329 pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
330 self.key_value_metadata.as_ref()
331 }
332
333 /// Returns sorting columns.
334 ///
335 /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
336 pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
337 self.sorting_columns.as_ref()
338 }
339
340 /// Returns the maximum length of truncated min/max values in the column index.
341 ///
342 /// `None` if truncation is disabled, must be greater than 0 otherwise.
343 ///
344 /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
345 pub fn column_index_truncate_length(&self) -> Option<usize> {
346 self.column_index_truncate_length
347 }
348
349 /// Returns the maximum length of truncated min/max values in [`Statistics`].
350 ///
351 /// `None` if truncation is disabled, must be greater than 0 otherwise.
352 ///
353 /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
354 ///
355 /// [`Statistics`]: crate::file::statistics::Statistics
356 pub fn statistics_truncate_length(&self) -> Option<usize> {
357 self.statistics_truncate_length
358 }
359
360 /// Returns `true` if type coercion is enabled.
361 ///
362 /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
363 pub fn coerce_types(&self) -> bool {
364 self.coerce_types
365 }
366
367 /// Returns encoding for a data page, when dictionary encoding is enabled.
368 ///
369 /// This is not configurable.
370 #[inline]
371 pub fn dictionary_data_page_encoding(&self) -> Encoding {
372 // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
373 // Dictionary values are encoded using RLE_DICTIONARY encoding.
374 Encoding::RLE_DICTIONARY
375 }
376
377 /// Returns encoding for dictionary page, when dictionary encoding is enabled.
378 ///
379 /// This is not configurable.
380 #[inline]
381 pub fn dictionary_page_encoding(&self) -> Encoding {
382 // PLAIN_DICTIONARY is deprecated in writer version 1.
383 // Dictionary is encoded using plain encoding.
384 Encoding::PLAIN
385 }
386
387 /// Returns encoding for a column, if set.
388 ///
389 /// In case when dictionary is enabled, returns fallback encoding.
390 ///
391 /// If encoding is not set, then column writer will choose the best encoding
392 /// based on the column type.
393 pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
394 self.column_properties
395 .get(col)
396 .and_then(|c| c.encoding())
397 .or_else(|| self.default_column_properties.encoding())
398 }
399
400 /// Returns compression codec for a column.
401 ///
402 /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
403 pub fn compression(&self, col: &ColumnPath) -> Compression {
404 self.column_properties
405 .get(col)
406 .and_then(|c| c.compression())
407 .or_else(|| self.default_column_properties.compression())
408 .unwrap_or(DEFAULT_COMPRESSION)
409 }
410
411 /// Returns `true` if dictionary encoding is enabled for a column.
412 ///
413 /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
414 pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
415 self.column_properties
416 .get(col)
417 .and_then(|c| c.dictionary_enabled())
418 .or_else(|| self.default_column_properties.dictionary_enabled())
419 .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
420 }
421
422 /// Returns which statistics are written for a column.
423 ///
424 /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
425 pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
426 self.column_properties
427 .get(col)
428 .and_then(|c| c.statistics_enabled())
429 .or_else(|| self.default_column_properties.statistics_enabled())
430 .unwrap_or(DEFAULT_STATISTICS_ENABLED)
431 }
432
433 /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
434 ///
435 /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
436 ///
437 /// [`Statistics`]: crate::file::statistics::Statistics
438 pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
439 self.column_properties
440 .get(col)
441 .and_then(|c| c.write_page_header_statistics())
442 .or_else(|| {
443 self.default_column_properties
444 .write_page_header_statistics()
445 })
446 .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
447 }
448
449 /// Returns the [`BloomFilterProperties`] for the given column
450 ///
451 /// Returns `None` if bloom filter is disabled
452 ///
453 /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
454 pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
455 self.column_properties
456 .get(col)
457 .and_then(|c| c.bloom_filter_properties())
458 .or_else(|| self.default_column_properties.bloom_filter_properties())
459 }
460
461 /// Return file encryption properties
462 ///
463 /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
464 #[cfg(feature = "encryption")]
465 pub fn file_encryption_properties(&self) -> Option<&Arc<FileEncryptionProperties>> {
466 self.file_encryption_properties.as_ref()
467 }
468}
469
/// Builder for [`WriterProperties`] Parquet writer configuration.
///
/// See example on [`WriterProperties`]
///
/// Each field mirrors the field of the same name on [`WriterProperties`];
/// see the corresponding getters there for the semantics of each setting.
#[derive(Debug, Clone)]
pub struct WriterPropertiesBuilder {
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_row_count: Option<usize>,
    max_row_group_bytes: Option<usize>,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    offset_index_disabled: bool,
    key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
493
494impl Default for WriterPropertiesBuilder {
495 /// Returns default state of the builder.
496 fn default() -> Self {
497 Self {
498 data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
499 write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
500 max_row_group_row_count: Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT),
501 max_row_group_bytes: None,
502 bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
503 writer_version: DEFAULT_WRITER_VERSION,
504 created_by: DEFAULT_CREATED_BY.to_string(),
505 offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
506 key_value_metadata: None,
507 default_column_properties: Default::default(),
508 column_properties: HashMap::new(),
509 sorting_columns: None,
510 column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
511 statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
512 coerce_types: DEFAULT_COERCE_TYPES,
513 #[cfg(feature = "encryption")]
514 file_encryption_properties: None,
515 }
516 }
517}
518
519impl WriterPropertiesBuilder {
520 /// Finalizes the configuration and returns immutable writer properties struct.
521 pub fn build(self) -> WriterProperties {
522 WriterProperties {
523 data_page_row_count_limit: self.data_page_row_count_limit,
524 write_batch_size: self.write_batch_size,
525 max_row_group_row_count: self.max_row_group_row_count,
526 max_row_group_bytes: self.max_row_group_bytes,
527 bloom_filter_position: self.bloom_filter_position,
528 writer_version: self.writer_version,
529 created_by: self.created_by,
530 offset_index_disabled: self.offset_index_disabled,
531 key_value_metadata: self.key_value_metadata,
532 default_column_properties: self.default_column_properties,
533 column_properties: self.column_properties,
534 sorting_columns: self.sorting_columns,
535 column_index_truncate_length: self.column_index_truncate_length,
536 statistics_truncate_length: self.statistics_truncate_length,
537 coerce_types: self.coerce_types,
538 #[cfg(feature = "encryption")]
539 file_encryption_properties: self.file_encryption_properties,
540 }
541 }
542
543 // ----------------------------------------------------------------------
544 // Writer properties related to a file
545
546 /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
547 /// via [`DEFAULT_WRITER_VERSION`])
548 ///
549 /// This value can determine what features some readers will support.
550 ///
551 /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
552 pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
553 self.writer_version = value;
554 self
555 }
556
557 /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
558 /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
559 ///
560 /// The parquet writer will attempt to limit the number of rows in
561 /// each `DataPage` to this value. Reducing this value will result
562 /// in larger parquet files, but may improve the effectiveness of
563 /// page index based predicate pushdown during reading.
564 ///
565 /// Note: this is a best effort limit based on value of
566 /// [`set_write_batch_size`](Self::set_write_batch_size).
567 pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
568 self.data_page_row_count_limit = value;
569 self
570 }
571
572 /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
573 ///
574 /// For performance reasons, data for each column is written in
575 /// batches of this size.
576 ///
577 /// Additional limits such as such as
578 /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
579 /// are checked between batches, and thus the write batch size value acts as an
580 /// upper-bound on the enforcement granularity of other limits.
581 pub fn set_write_batch_size(mut self, value: usize) -> Self {
582 self.write_batch_size = value;
583 self
584 }
585
586 /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
587 /// via [`DEFAULT_MAX_ROW_GROUP_ROW_COUNT`]).
588 ///
589 /// # Panics
590 /// If the value is set to 0.
591 #[deprecated(since = "58.0.0", note = "Use `set_max_row_group_row_count` instead")]
592 pub fn set_max_row_group_size(mut self, value: usize) -> Self {
593 assert!(value > 0, "Cannot have a 0 max row group size");
594 self.max_row_group_row_count = Some(value);
595 self
596 }
597
598 /// Sets maximum number of rows in a row group, or `None` for unlimited.
599 ///
600 /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
601 /// the row group with the smaller limit will be produced.
602 ///
603 /// # Panics
604 /// If the value is `Some(0)`.
605 pub fn set_max_row_group_row_count(mut self, value: Option<usize>) -> Self {
606 assert_ne!(value, Some(0), "Cannot have a 0 max row group row count");
607 self.max_row_group_row_count = value;
608 self
609 }
610
611 /// Sets maximum size of a row group in bytes, or `None` for unlimited.
612 ///
613 /// Row groups are flushed when their estimated encoded size exceeds this threshold.
614 /// This is similar to the official Java implementation for `parquet.block.size`'s behavior.
615 ///
616 /// If both `max_row_group_row_count` and `max_row_group_bytes` are set,
617 /// the row group with the smaller limit will be produced.
618 ///
619 /// # Panics
620 /// If the value is `Some(0)`.
621 pub fn set_max_row_group_bytes(mut self, value: Option<usize>) -> Self {
622 assert_ne!(value, Some(0), "Cannot have a 0 max row group bytes");
623 self.max_row_group_bytes = value;
624 self
625 }
626
627 /// Sets where in the final file Bloom Filters are written (defaults to [`AfterRowGroup`]
628 /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
629 ///
630 /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
631 pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
632 self.bloom_filter_position = value;
633 self
634 }
635
636 /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
637 /// [`DEFAULT_CREATED_BY`]).
638 ///
639 /// This is a string that will be written into the file metadata
640 pub fn set_created_by(mut self, value: String) -> Self {
641 self.created_by = value;
642 self
643 }
644
645 /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
646 /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
647 ///
648 /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
649 ///
650 /// Note: As the offset indexes are useful for accessing data by row number,
651 /// they are always written by default, regardless of whether other statistics
652 /// are enabled. Disabling this metadata may result in a degradation in read
653 /// performance, so use this option with care.
654 ///
655 /// [`Page`]: EnabledStatistics::Page
656 pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
657 self.offset_index_disabled = value;
658 self
659 }
660
661 /// Sets "key_value_metadata" property (defaults to `None`).
662 pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
663 self.key_value_metadata = value;
664 self
665 }
666
667 /// Sets sorting order of rows in the row group if any (defaults to `None`).
668 pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
669 self.sorting_columns = value;
670 self
671 }
672
673 /// Sets the max length of min/max value fields when writing the column
674 /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
675 ///
676 /// This can be used to prevent columns with very long values (hundreds of
677 /// bytes long) from causing the parquet metadata to become huge.
678 ///
679 /// # Notes
680 ///
681 /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
682 /// set to [`EnabledStatistics::Page`].
683 ///
684 /// * If `Some`, must be greater than 0, otherwise will panic
685 /// * If `None`, there's no effective limit.
686 ///
687 /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
688 pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
689 if let Some(value) = max_length {
690 assert!(
691 value > 0,
692 "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`."
693 );
694 }
695
696 self.column_index_truncate_length = max_length;
697 self
698 }
699
700 /// Sets the max length of min/max value fields in row group and data page header
701 /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
702 ///
703 /// # Notes
704 /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
705 /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
706 /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
707 /// [`EnabledStatistics::Page`].
708 ///
709 /// * If `Some`, must be greater than 0, otherwise will panic
710 /// * If `None`, there's no effective limit.
711 ///
712 /// # See also
713 /// Truncation of Page Index statistics is controlled separately via
714 /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
715 ///
716 /// [`Statistics`]: crate::file::statistics::Statistics
717 pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
718 if let Some(value) = max_length {
719 assert!(
720 value > 0,
721 "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`."
722 );
723 }
724
725 self.statistics_truncate_length = max_length;
726 self
727 }
728
729 /// Should the writer coerce types to parquet native types (defaults to `false` via
730 /// [`DEFAULT_COERCE_TYPES`]).
731 ///
732 /// Leaving this option the default `false` will ensure the exact same data
733 /// written to parquet using this library will be read.
734 ///
735 /// Setting this option to `true` will result in parquet files that can be
736 /// read by more readers, but potentially lose information in the process.
737 ///
738 /// * Types such as [`DataType::Date64`], which have no direct corresponding
739 /// Parquet type, may be stored with lower precision.
740 ///
741 /// * The internal field names of `List` and `Map` types will be renamed if
742 /// necessary to match what is required by the newest Parquet specification.
743 ///
744 /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
745 ///
746 /// [`DataType::Date64`]: arrow_schema::DataType::Date64
747 /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
748 pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
749 self.coerce_types = coerce_types;
750 self
751 }
752
753 /// Sets FileEncryptionProperties (defaults to `None`)
754 #[cfg(feature = "encryption")]
755 pub fn with_file_encryption_properties(
756 mut self,
757 file_encryption_properties: Arc<FileEncryptionProperties>,
758 ) -> Self {
759 self.file_encryption_properties = Some(file_encryption_properties);
760 self
761 }
762
763 // ----------------------------------------------------------------------
764 // Setters for any column (global)
765
766 /// Sets default encoding for all columns.
767 ///
768 /// If dictionary is not enabled, this is treated as a primary encoding for all
769 /// columns. In case when dictionary is enabled for any column, this value is
770 /// considered to be a fallback encoding for that column.
771 ///
772 /// # Panics
773 ///
774 /// if dictionary encoding is specified, regardless of dictionary
775 /// encoding flag being set.
776 pub fn set_encoding(mut self, value: Encoding) -> Self {
777 self.default_column_properties.set_encoding(value);
778 self
779 }
780
781 /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
782 /// [`DEFAULT_COMPRESSION`]).
783 ///
784 /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
785 pub fn set_compression(mut self, value: Compression) -> Self {
786 self.default_column_properties.set_compression(value);
787 self
788 }
789
790 /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
791 /// via [`DEFAULT_DICTIONARY_ENABLED`]).
792 ///
793 /// Use this method to set dictionary encoding, instead of explicitly specifying
794 /// encoding in `set_encoding` method.
795 pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
796 self.default_column_properties.set_dictionary_enabled(value);
797 self
798 }
799
800 /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
801 /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
802 ///
803 /// The parquet writer will attempt to limit the size of each
804 /// `DataPage` used to store dictionaries to this many
805 /// bytes. Reducing this value will result in larger parquet
806 /// files, but may improve the effectiveness of page index based
807 /// predicate pushdown during reading.
808 ///
809 /// Note: this is a best effort limit based on value of
810 /// [`set_write_batch_size`](Self::set_write_batch_size).
811 pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
812 self.default_column_properties
813 .set_dictionary_page_size_limit(value);
814 self
815 }
816
817 /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
818 /// via [`DEFAULT_PAGE_SIZE`]).
819 ///
820 /// The parquet writer will attempt to limit the sizes of each
821 /// `DataPage` to this many bytes. Reducing this value will result
822 /// in larger parquet files, but may improve the effectiveness of
823 /// page index based predicate pushdown during reading.
824 ///
825 /// Note: this is a best effort limit based on value of
826 /// [`set_write_batch_size`](Self::set_write_batch_size).
827 pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
828 self.default_column_properties
829 .set_data_page_size_limit(value);
830 self
831 }
832
833 /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
834 /// [`DEFAULT_STATISTICS_ENABLED`]).
835 ///
836 /// [`Page`]: EnabledStatistics::Page
837 pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
838 self.default_column_properties.set_statistics_enabled(value);
839 self
840 }
841
842 /// enable/disable writing [`Statistics`] in the page header
843 /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
844 ///
845 /// Only applicable if [`Page`] level statistics are gathered.
846 ///
847 /// Setting this value to `true` can greatly increase the size of the resulting Parquet
848 /// file while yielding very little added benefit. Most modern Parquet implementations
849 /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
850 /// those in the page header.
851 ///
852 /// # Note
853 ///
854 /// Prior to version 56.0.0, the `parquet` crate always wrote these
855 /// statistics (the equivalent of setting this option to `true`). This was
856 /// changed in 56.0.0 to follow the recommendation in the Parquet
857 /// specification. See [issue #7580] for more details.
858 ///
859 /// [`Statistics`]: crate::file::statistics::Statistics
860 /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
861 /// [`Page`]: EnabledStatistics::Page
862 /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
863 pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
864 self.default_column_properties
865 .set_write_page_header_statistics(value);
866 self
867 }
868
869 /// Sets if bloom filter should be written for all columns (defaults to `false`).
870 ///
871 /// # Notes
872 ///
873 /// * If the bloom filter is enabled previously then it is a no-op.
874 ///
875 /// * If the bloom filter is not enabled, default values for ndv and fpp
876 /// value are used used. See [`set_bloom_filter_ndv`] and
877 /// [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
878 ///
879 /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
880 /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
881 pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
882 self.default_column_properties
883 .set_bloom_filter_enabled(value);
884 self
885 }
886
887 /// Sets the default target bloom filter false positive probability (fpp)
888 /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
889 ///
890 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
891 /// been called.
892 ///
893 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
894 pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
895 self.default_column_properties.set_bloom_filter_fpp(value);
896 self
897 }
898
899 /// Sets default number of distinct values (ndv) for bloom filter for all
900 /// columns (defaults to `1_000_000` via [`DEFAULT_BLOOM_FILTER_NDV`]).
901 ///
902 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
903 /// been called.
904 ///
905 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
906 pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
907 self.default_column_properties.set_bloom_filter_ndv(value);
908 self
909 }
910
911 // ----------------------------------------------------------------------
912 // Setters for a specific column
913
914 /// Helper method to get existing or new mutable reference of column properties.
915 #[inline]
916 fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
917 self.column_properties.entry(col).or_default()
918 }
919
920 /// Sets encoding for a specific column.
921 ///
922 /// Takes precedence over [`Self::set_encoding`].
923 ///
924 /// If dictionary is not enabled, this is treated as a primary encoding for this
925 /// column. In case when dictionary is enabled for this column, either through
926 /// global defaults or explicitly, this value is considered to be a fallback
927 /// encoding for this column.
928 ///
929 /// # Panics
930 /// If user tries to set dictionary encoding here, regardless of dictionary
931 /// encoding flag being set.
932 pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
933 self.get_mut_props(col).set_encoding(value);
934 self
935 }
936
937 /// Sets compression codec for a specific column.
938 ///
939 /// Takes precedence over [`Self::set_compression`].
940 pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
941 self.get_mut_props(col).set_compression(value);
942 self
943 }
944
945 /// Sets flag to enable/disable dictionary encoding for a specific column.
946 ///
947 /// Takes precedence over [`Self::set_dictionary_enabled`].
948 pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
949 self.get_mut_props(col).set_dictionary_enabled(value);
950 self
951 }
952
953 /// Sets dictionary page size limit for a specific column.
954 ///
955 /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
956 pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
957 self.get_mut_props(col)
958 .set_dictionary_page_size_limit(value);
959 self
960 }
961
962 /// Sets data page size limit for a specific column.
963 ///
964 /// Takes precedence over [`Self::set_data_page_size_limit`].
965 pub fn set_column_data_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
966 self.get_mut_props(col).set_data_page_size_limit(value);
967 self
968 }
969
970 /// Sets [`EnabledStatistics`] level for a specific column.
971 ///
972 /// Takes precedence over [`Self::set_statistics_enabled`].
973 pub fn set_column_statistics_enabled(
974 mut self,
975 col: ColumnPath,
976 value: EnabledStatistics,
977 ) -> Self {
978 self.get_mut_props(col).set_statistics_enabled(value);
979 self
980 }
981
982 /// Sets whether to write [`Statistics`] in the page header for a specific column.
983 ///
984 /// Takes precedence over [`Self::set_write_page_header_statistics`].
985 ///
986 /// [`Statistics`]: crate::file::statistics::Statistics
987 pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
988 self.get_mut_props(col)
989 .set_write_page_header_statistics(value);
990 self
991 }
992
993 /// Sets whether a bloom filter should be written for a specific column.
994 ///
995 /// Takes precedence over [`Self::set_bloom_filter_enabled`].
996 pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
997 self.get_mut_props(col).set_bloom_filter_enabled(value);
998 self
999 }
1000
1001 /// Sets the false positive probability for bloom filter for a specific column.
1002 ///
1003 /// Takes precedence over [`Self::set_bloom_filter_fpp`].
1004 pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
1005 self.get_mut_props(col).set_bloom_filter_fpp(value);
1006 self
1007 }
1008
1009 /// Sets the number of distinct values for bloom filter for a specific column.
1010 ///
1011 /// Takes precedence over [`Self::set_bloom_filter_ndv`].
1012 pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
1013 self.get_mut_props(col).set_bloom_filter_ndv(value);
1014 self
1015 }
1016}
1017
impl From<WriterProperties> for WriterPropertiesBuilder {
    /// Rebuilds a builder from finalized [`WriterProperties`] so an existing
    /// configuration can be modified and built again. Every field is moved
    /// across verbatim; no defaults are re-applied.
    fn from(props: WriterProperties) -> Self {
        WriterPropertiesBuilder {
            data_page_row_count_limit: props.data_page_row_count_limit,
            write_batch_size: props.write_batch_size,
            max_row_group_row_count: props.max_row_group_row_count,
            max_row_group_bytes: props.max_row_group_bytes,
            bloom_filter_position: props.bloom_filter_position,
            writer_version: props.writer_version,
            created_by: props.created_by,
            offset_index_disabled: props.offset_index_disabled,
            key_value_metadata: props.key_value_metadata,
            // Both the global column defaults and the per-column overrides carry over.
            default_column_properties: props.default_column_properties,
            column_properties: props.column_properties,
            sorting_columns: props.sorting_columns,
            column_index_truncate_length: props.column_index_truncate_length,
            statistics_truncate_length: props.statistics_truncate_length,
            coerce_types: props.coerce_types,
            // Only present when the `encryption` feature is compiled in.
            #[cfg(feature = "encryption")]
            file_encryption_properties: props.file_encryption_properties,
        }
    }
}
1041
/// Controls the level of statistics computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. In addition, this will enable the writing
    /// of the column index (the offset index is always written regardless of
    /// this setting). See [`ParquetColumnIndex`] for
    /// more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}

impl FromStr for EnabledStatistics {
    type Err = String;

    /// Parses `"NONE"`/`"none"`, `"CHUNK"`/`"chunk"`, or `"PAGE"`/`"page"`;
    /// anything else (including mixed case) is rejected with an error message.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "NONE" | "none" => Ok(Self::None),
            "CHUNK" | "chunk" => Ok(Self::Chunk),
            "PAGE" | "page" => Ok(Self::Page),
            other => Err(format!("Invalid statistics arg: {other}")),
        }
    }
}
1085
impl Default for EnabledStatistics {
    /// Returns [`DEFAULT_STATISTICS_ENABLED`].
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
1091
/// Controls the bloom filter to be computed by the writer.
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability. This should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp, the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// Setting to a very small number diminishes the value of the filter itself, as the bitset size is
    /// even larger than just storing the whole value. You are also expected to set `ndv` if it can
    /// be known in advance to greatly reduce space usage.
    pub fpp: f64,
    /// Number of distinct values, should be non-negative to be meaningful. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Usage of bloom filter is most beneficial for columns with large cardinality, so a good heuristic
    /// is to set ndv to the number of rows. However, it can reduce disk size if you know in advance a smaller
    /// number of distinct values. For very small ndv value it is probably not worth it to use bloom filter
    /// anyway.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
    pub ndv: u64,
}
1119
1120impl Default for BloomFilterProperties {
1121 fn default() -> Self {
1122 BloomFilterProperties {
1123 fpp: DEFAULT_BLOOM_FILTER_FPP,
1124 ndv: DEFAULT_BLOOM_FILTER_NDV,
1125 }
1126 }
1127}
1128
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    /// Fallback (non-dictionary) encoding; never a dictionary encoding
    /// (`set_encoding` panics on those).
    encoding: Option<Encoding>,
    /// Compression codec for the column.
    codec: Option<Compression>,
    /// Best-effort data page size limit, in bytes.
    data_page_size_limit: Option<usize>,
    /// Best-effort dictionary page size limit, in bytes.
    dictionary_page_size_limit: Option<usize>,
    /// Whether dictionary encoding is enabled for the column.
    dictionary_enabled: Option<bool>,
    /// Level of statistics (none / chunk / page) to compute.
    statistics_enabled: Option<EnabledStatistics>,
    /// Whether statistics are also written into page headers.
    write_page_header_statistics: Option<bool>,
    /// bloom filter related properties; `None` means bloom filter disabled
    bloom_filter_properties: Option<BloomFilterProperties>,
}
1145
1146impl ColumnProperties {
1147 /// Sets encoding for this column.
1148 ///
1149 /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1150 /// In case when dictionary is enabled for a column, this value is considered to
1151 /// be a fallback encoding.
1152 ///
1153 /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1154 /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1155 /// for a column.
1156 fn set_encoding(&mut self, value: Encoding) {
1157 if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1158 panic!("Dictionary encoding can not be used as fallback encoding");
1159 }
1160 self.encoding = Some(value);
1161 }
1162
1163 /// Sets compression codec for this column.
1164 fn set_compression(&mut self, value: Compression) {
1165 self.codec = Some(value);
1166 }
1167
1168 /// Sets data page size limit for this column.
1169 fn set_data_page_size_limit(&mut self, value: usize) {
1170 self.data_page_size_limit = Some(value);
1171 }
1172
1173 /// Sets whether dictionary encoding is enabled for this column.
1174 fn set_dictionary_enabled(&mut self, enabled: bool) {
1175 self.dictionary_enabled = Some(enabled);
1176 }
1177
1178 /// Sets dictionary page size limit for this column.
1179 fn set_dictionary_page_size_limit(&mut self, value: usize) {
1180 self.dictionary_page_size_limit = Some(value);
1181 }
1182
1183 /// Sets the statistics level for this column.
1184 fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1185 self.statistics_enabled = Some(enabled);
1186 }
1187
1188 /// Sets whether to write statistics in the page header for this column.
1189 fn set_write_page_header_statistics(&mut self, enabled: bool) {
1190 self.write_page_header_statistics = Some(enabled);
1191 }
1192
1193 /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1194 /// otherwise it is a no-op.
1195 /// If `value` is `false`, resets bloom filter properties to `None`.
1196 fn set_bloom_filter_enabled(&mut self, value: bool) {
1197 if value && self.bloom_filter_properties.is_none() {
1198 self.bloom_filter_properties = Some(Default::default())
1199 } else if !value {
1200 self.bloom_filter_properties = None
1201 }
1202 }
1203
1204 /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1205 /// bloom filter if not previously enabled.
1206 ///
1207 /// # Panics
1208 ///
1209 /// Panics if the `value` is not between 0 and 1 exclusive
1210 fn set_bloom_filter_fpp(&mut self, value: f64) {
1211 assert!(
1212 value > 0. && value < 1.0,
1213 "fpp must be between 0 and 1 exclusive, got {value}"
1214 );
1215
1216 self.bloom_filter_properties
1217 .get_or_insert_with(Default::default)
1218 .fpp = value;
1219 }
1220
1221 /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
1222 /// enables bloom filter if not previously enabled.
1223 fn set_bloom_filter_ndv(&mut self, value: u64) {
1224 self.bloom_filter_properties
1225 .get_or_insert_with(Default::default)
1226 .ndv = value;
1227 }
1228
1229 /// Returns optional encoding for this column.
1230 fn encoding(&self) -> Option<Encoding> {
1231 self.encoding
1232 }
1233
1234 /// Returns optional compression codec for this column.
1235 fn compression(&self) -> Option<Compression> {
1236 self.codec
1237 }
1238
1239 /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1240 /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1241 /// been provided.
1242 fn dictionary_enabled(&self) -> Option<bool> {
1243 self.dictionary_enabled
1244 }
1245
1246 /// Returns optional dictionary page size limit for this column.
1247 fn dictionary_page_size_limit(&self) -> Option<usize> {
1248 self.dictionary_page_size_limit
1249 }
1250
1251 /// Returns optional data page size limit for this column.
1252 fn data_page_size_limit(&self) -> Option<usize> {
1253 self.data_page_size_limit
1254 }
1255
1256 /// Returns optional statistics level requested for this column. If result is `None`,
1257 /// then no setting has been provided.
1258 fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1259 self.statistics_enabled
1260 }
1261
1262 /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
1263 /// column.
1264 ///
1265 /// [`Statistics`]: crate::file::statistics::Statistics
1266 fn write_page_header_statistics(&self) -> Option<bool> {
1267 self.write_page_header_statistics
1268 }
1269
1270 /// Returns the bloom filter properties, or `None` if not enabled
1271 fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1272 self.bloom_filter_properties.as_ref()
1273 }
1274}
1275
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

// By default bloom filters are not read from the file.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
// By default page-level statistics are not decoded.
const DEFAULT_READ_PAGE_STATS: bool = false;
1281
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    // Options forwarded to the decompression codecs (e.g. LZ4 backward compatibility).
    codec_options: CodecOptions,
    // Whether bloom filters are read from the file.
    read_bloom_filter: bool,
    // Whether page-level statistics are decoded when reading.
    read_page_stats: bool,
}
1305
1306impl ReaderProperties {
1307 /// Returns builder for reader properties with default values.
1308 pub fn builder() -> ReaderPropertiesBuilder {
1309 ReaderPropertiesBuilder::with_defaults()
1310 }
1311
1312 /// Returns codec options.
1313 pub(crate) fn codec_options(&self) -> &CodecOptions {
1314 &self.codec_options
1315 }
1316
1317 /// Returns whether to read bloom filter
1318 pub(crate) fn read_bloom_filter(&self) -> bool {
1319 self.read_bloom_filter
1320 }
1321
1322 /// Returns whether to read page level statistics
1323 pub(crate) fn read_page_stats(&self) -> bool {
1324 self.read_page_stats
1325 }
1326}
1327
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    // Accumulates codec-related options until `build` is called.
    codec_options_builder: CodecOptionsBuilder,
    // `None` means "fall back to DEFAULT_READ_BLOOM_FILTER at build time".
    read_bloom_filter: Option<bool>,
    // `None` means "fall back to DEFAULT_READ_PAGE_STATS at build time".
    read_page_stats: Option<bool>,
}
1335
1336/// Reader properties builder.
1337impl ReaderPropertiesBuilder {
1338 /// Returns default state of the builder.
1339 fn with_defaults() -> Self {
1340 Self {
1341 codec_options_builder: CodecOptionsBuilder::default(),
1342 read_bloom_filter: None,
1343 read_page_stats: None,
1344 }
1345 }
1346
1347 /// Finalizes the configuration and returns immutable reader properties struct.
1348 pub fn build(self) -> ReaderProperties {
1349 ReaderProperties {
1350 codec_options: self.codec_options_builder.build(),
1351 read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1352 read_page_stats: self.read_page_stats.unwrap_or(DEFAULT_READ_PAGE_STATS),
1353 }
1354 }
1355
1356 /// Enable/disable backward compatible LZ4.
1357 ///
1358 /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1359 /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1360 /// with files generated by older versions of this library, and LZ4_RAW, for backward
1361 /// compatibility with files generated by older versions of parquet-cpp.
1362 ///
1363 /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1364 pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1365 self.codec_options_builder = self
1366 .codec_options_builder
1367 .set_backward_compatible_lz4(value);
1368 self
1369 }
1370
1371 /// Enable/disable reading bloom filter
1372 ///
1373 /// If reading bloom filter is enabled, bloom filter will be read from the file.
1374 /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1375 ///
1376 /// By default bloom filter is set to be read.
1377 pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1378 self.read_bloom_filter = Some(value);
1379 self
1380 }
1381
1382 /// Enable/disable reading page-level statistics
1383 ///
1384 /// If set to `true`, then the reader will decode and populate the [`Statistics`] for
1385 /// each page, if present.
1386 /// If set to `false`, then the reader will skip decoding the statistics.
1387 ///
1388 /// By default statistics will not be decoded.
1389 ///
1390 /// [`Statistics`]: crate::file::statistics::Statistics
1391 pub fn set_read_page_statistics(mut self, value: bool) -> Self {
1392 self.read_page_stats = Some(value);
1393 self
1394 }
1395}
1396
1397#[cfg(test)]
1398mod tests {
1399 use super::*;
1400
    // WriterVersion must map onto the numeric parquet format versions 1 and 2.
    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    // A default-constructed WriterProperties must expose the documented
    // DEFAULT_* constants, and per-column lookups on an unconfigured column
    // must fall back to those same defaults.
    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(
            props.max_row_group_row_count(),
            Some(DEFAULT_MAX_ROW_GROUP_ROW_COUNT)
        );
        assert_eq!(props.max_row_group_bytes(), None);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        // No encoding has been configured for "col", so None is returned.
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        // Bloom filters are disabled by default.
        assert!(
            props
                .bloom_filter_properties(&ColumnPath::from("col"))
                .is_none()
        );
    }
1444
    // The dictionary page / dictionary data page encodings are fixed by the
    // writer and must not vary with the writer version.
    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    // The next four tests pin the panic behavior of ColumnProperties::set_encoding:
    // a dictionary encoding is never a valid fallback encoding, no matter how
    // the dictionary-enabled flag is set.
    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }
1496
    // Exercises file-level, global-column and per-column builder setters and
    // verifies the resulting properties; the same checks are re-run after a
    // round trip through into_builder() to prove the conversion is lossless.
    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_row_count(Some(40))
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        // Shared assertions, applied both before and after the round trip.
        fn test_props(props: &WriterProperties) {
            assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
            assert_eq!(props.data_page_size_limit(), 10);
            assert_eq!(props.dictionary_page_size_limit(), 20);
            assert_eq!(props.write_batch_size(), 30);
            assert_eq!(props.max_row_group_row_count(), Some(40));
            assert_eq!(props.created_by(), "default");
            assert_eq!(
                props.key_value_metadata(),
                Some(&vec![
                    KeyValue::new("key".to_string(), "value".to_string(),)
                ])
            );

            // Column "a" has no specific settings: global column settings apply.
            assert_eq!(
                props.encoding(&ColumnPath::from("a")),
                Some(Encoding::DELTA_BINARY_PACKED)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("a")),
                Compression::GZIP(Default::default())
            );
            assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("a")),
                EnabledStatistics::None
            );

            // Column "col" has specific settings overriding the globals.
            assert_eq!(
                props.encoding(&ColumnPath::from("col")),
                Some(Encoding::RLE)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("col")),
                Compression::SNAPPY
            );
            assert!(props.dictionary_enabled(&ColumnPath::from("col")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("col")),
                EnabledStatistics::Chunk
            );
            assert_eq!(
                props.bloom_filter_properties(&ColumnPath::from("col")),
                Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
            );
        }

        // Test direct build of properties
        test_props(&props);

        // Test that into_builder() gives the same result
        let props_into_builder_and_back = props.into_builder().build();
        test_props(&props_into_builder_and_back);
    }
1580
    // Options not explicitly set must keep their documented defaults, while
    // explicitly set ones (including per-column overrides) stick.
    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        // Enabling the bloom filter without further tuning installs the
        // default fpp/ndv values.
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 1_000_000_u64
            })
        );
    }

    // The deprecated set_max_row_group_size must still feed the row-count
    // based accessors.
    #[test]
    #[allow(deprecated)]
    fn test_writer_properties_deprecated_max_row_group_size_still_works() {
        let props = WriterProperties::builder()
            .set_max_row_group_size(42)
            .build();

        assert_eq!(props.max_row_group_row_count(), Some(42));
        assert_eq!(props.max_row_group_size(), 42);
    }

    // Zero-sized row group limits are invalid and rejected eagerly by the setter.
    #[test]
    #[should_panic(expected = "Cannot have a 0 max row group row count")]
    fn test_writer_properties_panic_on_zero_row_group_row_count() {
        let _ = WriterProperties::builder().set_max_row_group_row_count(Some(0));
    }

    #[test]
    #[should_panic(expected = "Cannot have a 0 max row group bytes")]
    fn test_writer_properties_panic_on_zero_row_group_bytes() {
        let _ = WriterProperties::builder().set_max_row_group_bytes(Some(0));
    }

    // Setting only ndv or only fpp implicitly enables the bloom filter and
    // fills the other field with its default.
    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 100
            })
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: 1_000_000_u64
            })
        );
    }

    // A per-column dictionary page size limit overrides the global one; other
    // columns keep the global value.
    #[test]
    fn test_writer_properties_column_dictionary_page_size_limit() {
        let props = WriterProperties::builder()
            .set_dictionary_page_size_limit(100)
            .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
            .build();

        assert_eq!(props.dictionary_page_size_limit(), 100);
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
            10
        );
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
            100
        );
    }

    // Same override behaviour for the data page size limit.
    #[test]
    fn test_writer_properties_column_data_page_size_limit() {
        let props = WriterProperties::builder()
            .set_data_page_size_limit(100)
            .set_column_data_page_size_limit(ColumnPath::from("col"), 10)
            .build();

        assert_eq!(props.data_page_size_limit(), 100);
        assert_eq!(
            props.column_data_page_size_limit(&ColumnPath::from("col")),
            10
        );
        assert_eq!(
            props.column_data_page_size_limit(&ColumnPath::from("other")),
            100
        );
    }
1699
1700 #[test]
1701 fn test_reader_properties_default_settings() {
1702 let props = ReaderProperties::builder().build();
1703
1704 let codec_options = CodecOptionsBuilder::default()
1705 .set_backward_compatible_lz4(true)
1706 .build();
1707
1708 assert_eq!(props.codec_options(), &codec_options);
1709 assert!(!props.read_bloom_filter());
1710 }
1711
1712 #[test]
1713 fn test_reader_properties_builder() {
1714 let props = ReaderProperties::builder()
1715 .set_backward_compatible_lz4(false)
1716 .build();
1717
1718 let codec_options = CodecOptionsBuilder::default()
1719 .set_backward_compatible_lz4(false)
1720 .build();
1721
1722 assert_eq!(props.codec_options(), &codec_options);
1723 }
1724
1725 #[test]
1726 fn test_parse_writerversion() {
1727 let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
1728 assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1729 writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
1730 assert_eq!(writer_version, WriterVersion::PARQUET_2_0);
1731
1732 // test lowercase
1733 writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
1734 assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1735
1736 // test invalid version
1737 match "PARQUET_-1_0".parse::<WriterVersion>() {
1738 Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
1739 Err(e) => {
1740 assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
1741 }
1742 }
1743 }
1744
1745 #[test]
1746 fn test_parse_enabledstatistics() {
1747 let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
1748 assert_eq!(enabled_statistics, EnabledStatistics::None);
1749 enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
1750 assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
1751 enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
1752 assert_eq!(enabled_statistics, EnabledStatistics::Page);
1753
1754 // test lowercase
1755 enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
1756 assert_eq!(enabled_statistics, EnabledStatistics::None);
1757
1758 //test invalid statistics
1759 match "ChunkAndPage".parse::<EnabledStatistics>() {
1760 Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
1761 Err(e) => {
1762 assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
1763 }
1764 }
1765 }
1766}