parquet/file/properties.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::KeyValue;
24use crate::format::SortingColumn;
25use crate::schema::types::ColumnPath;
26use std::str::FromStr;
27use std::{collections::HashMap, sync::Arc};
28
/// Default value for [`WriterProperties::data_page_size_limit`]
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_size`]
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
65
/// Version of the Parquet format that the writer targets.
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// The variants are spelled in SCREAMING_SNAKE_CASE, which would otherwise trip this lint.
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns the numeric form of this writer version: `1` for
    /// [`PARQUET_1_0`](Self::PARQUET_1_0) and `2` for [`PARQUET_2_0`](Self::PARQUET_2_0).
    pub fn as_num(&self) -> i32 {
        match *self {
            Self::PARQUET_1_0 => 1,
            Self::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a writer version from its variant name.
    ///
    /// Only the all-uppercase and all-lowercase spellings are recognised
    /// (e.g. `"PARQUET_1_0"` or `"parquet_1_0"`).
    ///
    /// # Errors
    ///
    /// Any other input yields `Err` with a message that echoes the rejected string.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let version = match s {
            "PARQUET_1_0" | "parquet_1_0" => Self::PARQUET_1_0,
            "PARQUET_2_0" | "parquet_2_0" => Self::PARQUET_2_0,
            _ => return Err(format!("Invalid writer version: {s}")),
        };
        Ok(version)
    }
}
99
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The default is given by [`DEFAULT_BLOOM_FILTER_POSITION`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
117
/// Reference counted writer properties: an [`Arc`] around [`WriterProperties`].
pub type WriterPropertiesPtr = Arc<WriterProperties>;
120
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    /// Best effort maximum size of a data page, in bytes.
    data_page_size_limit: usize,
    /// Best effort maximum number of rows in a data page.
    data_page_row_count_limit: usize,
    /// Batch size used when writing column data.
    write_batch_size: usize,
    /// Maximum number of rows in a row group.
    max_row_group_size: usize,
    /// Where Bloom filters are placed in the file.
    bloom_filter_position: BloomFilterPosition,
    /// Writer version recorded in the parquet metadata.
    writer_version: WriterVersion,
    /// "created by" string written into the file metadata.
    created_by: String,
    /// Requested offset-index setting; the `offset_index_disabled` getter
    /// reports `false` whenever page-level statistics are explicitly enabled.
    offset_index_disabled: bool,
    /// Optional key/value pairs, as set via the builder.
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings used when a column has no override of its own.
    default_column_properties: ColumnProperties,
    /// Per-column overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Sorting order of rows in the row group, if any.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Maximum length of min/max values in the column index; `None` disables truncation.
    column_index_truncate_length: Option<usize>,
    /// Maximum length of min/max values in statistics; `None` disables truncation.
    statistics_truncate_length: Option<usize>,
    /// Whether types are coerced to parquet native types.
    coerce_types: bool,
    /// File encryption settings, if any (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<FileEncryptionProperties>,
}
175
176impl Default for WriterProperties {
177 fn default() -> Self {
178 Self::builder().build()
179 }
180}
181
182impl WriterProperties {
183 /// Create a new [`WriterProperties`] with the default settings
184 ///
185 /// See [`WriterProperties::builder`] for customising settings
186 pub fn new() -> Self {
187 Self::default()
188 }
189
190 /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
191 /// properties.
192 pub fn builder() -> WriterPropertiesBuilder {
193 WriterPropertiesBuilder::default()
194 }
195
196 /// Converts this [`WriterProperties`] into a [`WriterPropertiesBuilder`]
197 /// Used for mutating existing property settings
198 pub fn into_builder(self) -> WriterPropertiesBuilder {
199 self.into()
200 }
201
202 /// Returns data page size limit.
203 ///
204 /// Note: this is a best effort limit based on the write batch size
205 ///
206 /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
207 pub fn data_page_size_limit(&self) -> usize {
208 self.data_page_size_limit
209 }
210
211 /// Returns dictionary page size limit.
212 ///
213 /// Note: this is a best effort limit based on the write batch size
214 ///
215 /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
216 pub fn dictionary_page_size_limit(&self) -> usize {
217 self.default_column_properties
218 .dictionary_page_size_limit()
219 .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
220 }
221
222 /// Returns dictionary page size limit for a specific column.
223 pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
224 self.column_properties
225 .get(col)
226 .and_then(|c| c.dictionary_page_size_limit())
227 .or_else(|| self.default_column_properties.dictionary_page_size_limit())
228 .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
229 }
230
231 /// Returns the maximum page row count
232 ///
233 /// Note: this is a best effort limit based on the write batch size
234 ///
235 /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
236 pub fn data_page_row_count_limit(&self) -> usize {
237 self.data_page_row_count_limit
238 }
239
240 /// Returns configured batch size for writes.
241 ///
242 /// When writing a batch of data, this setting allows to split it internally into
243 /// smaller batches so we can better estimate the size of a page currently being
244 /// written.
245 ///
246 /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
247 pub fn write_batch_size(&self) -> usize {
248 self.write_batch_size
249 }
250
251 /// Returns maximum number of rows in a row group.
252 ///
253 /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
254 pub fn max_row_group_size(&self) -> usize {
255 self.max_row_group_size
256 }
257
258 /// Returns bloom filter position.
259 ///
260 /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
261 pub fn bloom_filter_position(&self) -> BloomFilterPosition {
262 self.bloom_filter_position
263 }
264
265 /// Returns configured writer version.
266 ///
267 /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
268 pub fn writer_version(&self) -> WriterVersion {
269 self.writer_version
270 }
271
272 /// Returns `created_by` string.
273 ///
274 /// For more details see [`WriterPropertiesBuilder::set_created_by`]
275 pub fn created_by(&self) -> &str {
276 &self.created_by
277 }
278
279 /// Returns `true` if offset index writing is disabled.
280 ///
281 /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
282 pub fn offset_index_disabled(&self) -> bool {
283 // If page statistics are to be collected, then do not disable the offset indexes.
284 let default_page_stats_enabled =
285 self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
286 let column_page_stats_enabled = self
287 .column_properties
288 .iter()
289 .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
290 if default_page_stats_enabled || column_page_stats_enabled {
291 return false;
292 }
293
294 self.offset_index_disabled
295 }
296
297 /// Returns `key_value_metadata` KeyValue pairs.
298 ///
299 /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
300 pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
301 self.key_value_metadata.as_ref()
302 }
303
304 /// Returns sorting columns.
305 ///
306 /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
307 pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
308 self.sorting_columns.as_ref()
309 }
310
311 /// Returns the maximum length of truncated min/max values in the column index.
312 ///
313 /// `None` if truncation is disabled, must be greater than 0 otherwise.
314 ///
315 /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
316 pub fn column_index_truncate_length(&self) -> Option<usize> {
317 self.column_index_truncate_length
318 }
319
320 /// Returns the maximum length of truncated min/max values in [`Statistics`].
321 ///
322 /// `None` if truncation is disabled, must be greater than 0 otherwise.
323 ///
324 /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
325 ///
326 /// [`Statistics`]: crate::file::statistics::Statistics
327 pub fn statistics_truncate_length(&self) -> Option<usize> {
328 self.statistics_truncate_length
329 }
330
331 /// Returns `true` if type coercion is enabled.
332 ///
333 /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
334 pub fn coerce_types(&self) -> bool {
335 self.coerce_types
336 }
337
338 /// Returns encoding for a data page, when dictionary encoding is enabled.
339 ///
340 /// This is not configurable.
341 #[inline]
342 pub fn dictionary_data_page_encoding(&self) -> Encoding {
343 // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
344 // Dictionary values are encoded using RLE_DICTIONARY encoding.
345 Encoding::RLE_DICTIONARY
346 }
347
348 /// Returns encoding for dictionary page, when dictionary encoding is enabled.
349 ///
350 /// This is not configurable.
351 #[inline]
352 pub fn dictionary_page_encoding(&self) -> Encoding {
353 // PLAIN_DICTIONARY is deprecated in writer version 1.
354 // Dictionary is encoded using plain encoding.
355 Encoding::PLAIN
356 }
357
358 /// Returns encoding for a column, if set.
359 ///
360 /// In case when dictionary is enabled, returns fallback encoding.
361 ///
362 /// If encoding is not set, then column writer will choose the best encoding
363 /// based on the column type.
364 pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
365 self.column_properties
366 .get(col)
367 .and_then(|c| c.encoding())
368 .or_else(|| self.default_column_properties.encoding())
369 }
370
371 /// Returns compression codec for a column.
372 ///
373 /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
374 pub fn compression(&self, col: &ColumnPath) -> Compression {
375 self.column_properties
376 .get(col)
377 .and_then(|c| c.compression())
378 .or_else(|| self.default_column_properties.compression())
379 .unwrap_or(DEFAULT_COMPRESSION)
380 }
381
382 /// Returns `true` if dictionary encoding is enabled for a column.
383 ///
384 /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
385 pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
386 self.column_properties
387 .get(col)
388 .and_then(|c| c.dictionary_enabled())
389 .or_else(|| self.default_column_properties.dictionary_enabled())
390 .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
391 }
392
393 /// Returns which statistics are written for a column.
394 ///
395 /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
396 pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
397 self.column_properties
398 .get(col)
399 .and_then(|c| c.statistics_enabled())
400 .or_else(|| self.default_column_properties.statistics_enabled())
401 .unwrap_or(DEFAULT_STATISTICS_ENABLED)
402 }
403
404 /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
405 ///
406 /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
407 ///
408 /// [`Statistics`]: crate::file::statistics::Statistics
409 pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
410 self.column_properties
411 .get(col)
412 .and_then(|c| c.write_page_header_statistics())
413 .or_else(|| {
414 self.default_column_properties
415 .write_page_header_statistics()
416 })
417 .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
418 }
419
420 /// Returns the [`BloomFilterProperties`] for the given column
421 ///
422 /// Returns `None` if bloom filter is disabled
423 ///
424 /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
425 pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
426 self.column_properties
427 .get(col)
428 .and_then(|c| c.bloom_filter_properties())
429 .or_else(|| self.default_column_properties.bloom_filter_properties())
430 }
431
432 /// Return file encryption properties
433 ///
434 /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
435 #[cfg(feature = "encryption")]
436 pub fn file_encryption_properties(&self) -> Option<&FileEncryptionProperties> {
437 self.file_encryption_properties.as_ref()
438 }
439}
440
/// Builder for [`WriterProperties`] Parquet writer configuration.
///
/// Each field is copied unchanged into the [`WriterProperties`] field of the
/// same name by `build`.
///
/// See example on [`WriterProperties`]
#[derive(Debug, Clone)]
pub struct WriterPropertiesBuilder {
    /// Best effort maximum size of a data page, in bytes.
    data_page_size_limit: usize,
    /// Best effort maximum number of rows in a data page.
    data_page_row_count_limit: usize,
    /// Batch size used when writing column data.
    write_batch_size: usize,
    /// Maximum number of rows in a row group.
    max_row_group_size: usize,
    /// Where Bloom filters are placed in the file.
    bloom_filter_position: BloomFilterPosition,
    /// Writer version recorded in the parquet metadata.
    writer_version: WriterVersion,
    /// "created by" string written into the file metadata.
    created_by: String,
    /// Whether writing of offset indexes is disabled.
    offset_index_disabled: bool,
    /// Optional key/value pairs.
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings applied to all columns (the "global" setters).
    default_column_properties: ColumnProperties,
    /// Per-column overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Sorting order of rows in the row group, if any.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Maximum length of min/max values in the column index; `None` disables truncation.
    column_index_truncate_length: Option<usize>,
    /// Maximum length of min/max values in statistics; `None` disables truncation.
    statistics_truncate_length: Option<usize>,
    /// Whether types are coerced to parquet native types.
    coerce_types: bool,
    /// File encryption settings, if any (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<FileEncryptionProperties>,
}
464
465impl Default for WriterPropertiesBuilder {
466 /// Returns default state of the builder.
467 fn default() -> Self {
468 Self {
469 data_page_size_limit: DEFAULT_PAGE_SIZE,
470 data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
471 write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
472 max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
473 bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
474 writer_version: DEFAULT_WRITER_VERSION,
475 created_by: DEFAULT_CREATED_BY.to_string(),
476 offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
477 key_value_metadata: None,
478 default_column_properties: Default::default(),
479 column_properties: HashMap::new(),
480 sorting_columns: None,
481 column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
482 statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
483 coerce_types: DEFAULT_COERCE_TYPES,
484 #[cfg(feature = "encryption")]
485 file_encryption_properties: None,
486 }
487 }
488}
489
490impl WriterPropertiesBuilder {
491 /// Finalizes the configuration and returns immutable writer properties struct.
492 pub fn build(self) -> WriterProperties {
493 WriterProperties {
494 data_page_size_limit: self.data_page_size_limit,
495 data_page_row_count_limit: self.data_page_row_count_limit,
496 write_batch_size: self.write_batch_size,
497 max_row_group_size: self.max_row_group_size,
498 bloom_filter_position: self.bloom_filter_position,
499 writer_version: self.writer_version,
500 created_by: self.created_by,
501 offset_index_disabled: self.offset_index_disabled,
502 key_value_metadata: self.key_value_metadata,
503 default_column_properties: self.default_column_properties,
504 column_properties: self.column_properties,
505 sorting_columns: self.sorting_columns,
506 column_index_truncate_length: self.column_index_truncate_length,
507 statistics_truncate_length: self.statistics_truncate_length,
508 coerce_types: self.coerce_types,
509 #[cfg(feature = "encryption")]
510 file_encryption_properties: self.file_encryption_properties,
511 }
512 }
513
514 // ----------------------------------------------------------------------
515 // Writer properties related to a file
516
517 /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
518 /// via [`DEFAULT_WRITER_VERSION`])
519 ///
520 /// This value can determine what features some readers will support.
521 ///
522 /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
523 pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
524 self.writer_version = value;
525 self
526 }
527
528 /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
529 /// via [`DEFAULT_PAGE_SIZE`]).
530 ///
531 /// The parquet writer will attempt to limit the sizes of each
532 /// `DataPage` to this many bytes. Reducing this value will result
533 /// in larger parquet files, but may improve the effectiveness of
534 /// page index based predicate pushdown during reading.
535 ///
536 /// Note: this is a best effort limit based on value of
537 /// [`set_write_batch_size`](Self::set_write_batch_size).
538 pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
539 self.data_page_size_limit = value;
540 self
541 }
542
543 /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
544 /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
545 ///
546 /// The parquet writer will attempt to limit the number of rows in
547 /// each `DataPage` to this value. Reducing this value will result
548 /// in larger parquet files, but may improve the effectiveness of
549 /// page index based predicate pushdown during reading.
550 ///
551 /// Note: this is a best effort limit based on value of
552 /// [`set_write_batch_size`](Self::set_write_batch_size).
553 pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
554 self.data_page_row_count_limit = value;
555 self
556 }
557
558 /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
559 ///
560 /// For performance reasons, data for each column is written in
561 /// batches of this size.
562 ///
563 /// Additional limits such as such as
564 /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
565 /// are checked between batches, and thus the write batch size value acts as an
566 /// upper-bound on the enforcement granularity of other limits.
567 pub fn set_write_batch_size(mut self, value: usize) -> Self {
568 self.write_batch_size = value;
569 self
570 }
571
572 /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
573 /// via [`DEFAULT_MAX_ROW_GROUP_SIZE`]).
574 ///
575 /// # Panics
576 /// If the value is set to 0.
577 pub fn set_max_row_group_size(mut self, value: usize) -> Self {
578 assert!(value > 0, "Cannot have a 0 max row group size");
579 self.max_row_group_size = value;
580 self
581 }
582
583 /// Sets where in the final file Bloom Filters are written (defaults to [`AfterRowGroup`]
584 /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
585 ///
586 /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
587 pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
588 self.bloom_filter_position = value;
589 self
590 }
591
592 /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
593 /// [`DEFAULT_CREATED_BY`]).
594 ///
595 /// This is a string that will be written into the file metadata
596 pub fn set_created_by(mut self, value: String) -> Self {
597 self.created_by = value;
598 self
599 }
600
601 /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
602 /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
603 ///
604 /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
605 ///
606 /// Note: As the offset indexes are useful for accessing data by row number,
607 /// they are always written by default, regardless of whether other statistics
608 /// are enabled. Disabling this metadata may result in a degradation in read
609 /// performance, so use this option with care.
610 ///
611 /// [`Page`]: EnabledStatistics::Page
612 pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
613 self.offset_index_disabled = value;
614 self
615 }
616
617 /// Sets "key_value_metadata" property (defaults to `None`).
618 pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
619 self.key_value_metadata = value;
620 self
621 }
622
623 /// Sets sorting order of rows in the row group if any (defaults to `None`).
624 pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
625 self.sorting_columns = value;
626 self
627 }
628
629 /// Sets the max length of min/max value fields when writing the column
630 /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
631 ///
632 /// This can be used to prevent columns with very long values (hundreds of
633 /// bytes long) from causing the parquet metadata to become huge.
634 ///
635 /// # Notes
636 ///
637 /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
638 /// set to [`EnabledStatistics::Page`].
639 ///
640 /// * If `Some`, must be greater than 0, otherwise will panic
641 /// * If `None`, there's no effective limit.
642 ///
643 /// [`Index`]: crate::file::page_index::index::Index
644 pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
645 if let Some(value) = max_length {
646 assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`.");
647 }
648
649 self.column_index_truncate_length = max_length;
650 self
651 }
652
653 /// Sets the max length of min/max value fields in row group and data page header
654 /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
655 ///
656 /// # Notes
657 /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
658 /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
659 /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
660 /// [`EnabledStatistics::Page`].
661 ///
662 /// * If `Some`, must be greater than 0, otherwise will panic
663 /// * If `None`, there's no effective limit.
664 ///
665 /// # See also
666 /// Truncation of Page Index statistics is controlled separately via
667 /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
668 ///
669 /// [`Statistics`]: crate::file::statistics::Statistics
670 pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
671 if let Some(value) = max_length {
672 assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
673 }
674
675 self.statistics_truncate_length = max_length;
676 self
677 }
678
679 /// Should the writer coerce types to parquet native types (defaults to `false` via
680 /// [`DEFAULT_COERCE_TYPES`]).
681 ///
682 /// Leaving this option the default `false` will ensure the exact same data
683 /// written to parquet using this library will be read.
684 ///
685 /// Setting this option to `true` will result in parquet files that can be
686 /// read by more readers, but potentially lose information in the process.
687 ///
688 /// * Types such as [`DataType::Date64`], which have no direct corresponding
689 /// Parquet type, may be stored with lower precision.
690 ///
691 /// * The internal field names of `List` and `Map` types will be renamed if
692 /// necessary to match what is required by the newest Parquet specification.
693 ///
694 /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
695 ///
696 /// [`DataType::Date64`]: arrow_schema::DataType::Date64
697 /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
698 pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
699 self.coerce_types = coerce_types;
700 self
701 }
702
/// Sets the [`FileEncryptionProperties`] (defaults to `None`).
///
/// Only available when the `encryption` feature is enabled.
#[cfg(feature = "encryption")]
pub fn with_file_encryption_properties(
    self,
    file_encryption_properties: FileEncryptionProperties,
) -> Self {
    Self {
        file_encryption_properties: Some(file_encryption_properties),
        ..self
    }
}
712
713 // ----------------------------------------------------------------------
714 // Setters for any column (global)
715
716 /// Sets default encoding for all columns.
717 ///
718 /// If dictionary is not enabled, this is treated as a primary encoding for all
719 /// columns. In case when dictionary is enabled for any column, this value is
720 /// considered to be a fallback encoding for that column.
721 ///
722 /// # Panics
723 ///
724 /// if dictionary encoding is specified, regardless of dictionary
725 /// encoding flag being set.
726 pub fn set_encoding(mut self, value: Encoding) -> Self {
727 self.default_column_properties.set_encoding(value);
728 self
729 }
730
731 /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
732 /// [`DEFAULT_COMPRESSION`]).
733 ///
734 /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
735 pub fn set_compression(mut self, value: Compression) -> Self {
736 self.default_column_properties.set_compression(value);
737 self
738 }
739
740 /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
741 /// via [`DEFAULT_DICTIONARY_ENABLED`]).
742 ///
743 /// Use this method to set dictionary encoding, instead of explicitly specifying
744 /// encoding in `set_encoding` method.
745 pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
746 self.default_column_properties.set_dictionary_enabled(value);
747 self
748 }
749
750 /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
751 /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
752 ///
753 /// The parquet writer will attempt to limit the size of each
754 /// `DataPage` used to store dictionaries to this many
755 /// bytes. Reducing this value will result in larger parquet
756 /// files, but may improve the effectiveness of page index based
757 /// predicate pushdown during reading.
758 ///
759 /// Note: this is a best effort limit based on value of
760 /// [`set_write_batch_size`](Self::set_write_batch_size).
761 pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
762 self.default_column_properties
763 .set_dictionary_page_size_limit(value);
764 self
765 }
766
767 /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
768 /// [`DEFAULT_STATISTICS_ENABLED`]).
769 ///
770 /// [`Page`]: EnabledStatistics::Page
771 pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
772 self.default_column_properties.set_statistics_enabled(value);
773 self
774 }
775
776 /// enable/disable writing [`Statistics`] in the page header
777 /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
778 ///
779 /// Only applicable if [`Page`] level statistics are gathered.
780 ///
781 /// Setting this value to `true` can greatly increase the size of the resulting Parquet
782 /// file while yielding very little added benefit. Most modern Parquet implementations
783 /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
784 /// those in the page header.
785 ///
786 /// # Note
787 ///
788 /// Prior to version 56.0.0, the `parquet` crate always wrote these
789 /// statistics (the equivalent of setting this option to `true`). This was
790 /// changed in 56.0.0 to follow the recommendation in the Parquet
791 /// specification. See [issue #7580] for more details.
792 ///
793 /// [`Statistics`]: crate::file::statistics::Statistics
794 /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
795 /// [`Page`]: EnabledStatistics::Page
796 /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
797 pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
798 self.default_column_properties
799 .set_write_page_header_statistics(value);
800 self
801 }
802
803 /// Sets if bloom filter should be written for all columns (defaults to `false`).
804 ///
805 /// # Notes
806 ///
807 /// * If the bloom filter is enabled previously then it is a no-op.
808 ///
809 /// * If the bloom filter is not enabled, default values for ndv and fpp
810 /// value are used used. See [`set_bloom_filter_ndv`] and
811 /// [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
812 ///
813 /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
814 /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
815 pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
816 self.default_column_properties
817 .set_bloom_filter_enabled(value);
818 self
819 }
820
821 /// Sets the default target bloom filter false positive probability (fpp)
822 /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
823 ///
824 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
825 /// been called.
826 ///
827 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
828 pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
829 self.default_column_properties.set_bloom_filter_fpp(value);
830 self
831 }
832
833 /// Sets default number of distinct values (ndv) for bloom filter for all
834 /// columns (defaults to `1_000_000` via [`DEFAULT_BLOOM_FILTER_NDV`]).
835 ///
836 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
837 /// been called.
838 ///
839 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
840 pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
841 self.default_column_properties.set_bloom_filter_ndv(value);
842 self
843 }
844
845 // ----------------------------------------------------------------------
846 // Setters for a specific column
847
848 /// Helper method to get existing or new mutable reference of column properties.
849 #[inline]
850 fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
851 self.column_properties.entry(col).or_default()
852 }
853
854 /// Sets encoding for a specific column.
855 ///
856 /// Takes precedence over [`Self::set_encoding`].
857 ///
858 /// If dictionary is not enabled, this is treated as a primary encoding for this
859 /// column. In case when dictionary is enabled for this column, either through
860 /// global defaults or explicitly, this value is considered to be a fallback
861 /// encoding for this column.
862 ///
863 /// # Panics
864 /// If user tries to set dictionary encoding here, regardless of dictionary
865 /// encoding flag being set.
866 pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
867 self.get_mut_props(col).set_encoding(value);
868 self
869 }
870
871 /// Sets compression codec for a specific column.
872 ///
873 /// Takes precedence over [`Self::set_compression`].
874 pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
875 self.get_mut_props(col).set_compression(value);
876 self
877 }
878
879 /// Sets flag to enable/disable dictionary encoding for a specific column.
880 ///
881 /// Takes precedence over [`Self::set_dictionary_enabled`].
882 pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
883 self.get_mut_props(col).set_dictionary_enabled(value);
884 self
885 }
886
887 /// Sets dictionary page size limit for a specific column.
888 ///
889 /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
890 pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
891 self.get_mut_props(col)
892 .set_dictionary_page_size_limit(value);
893 self
894 }
895
896 /// Sets [`EnabledStatistics`] level for a specific column.
897 ///
898 /// Takes precedence over [`Self::set_statistics_enabled`].
899 pub fn set_column_statistics_enabled(
900 mut self,
901 col: ColumnPath,
902 value: EnabledStatistics,
903 ) -> Self {
904 self.get_mut_props(col).set_statistics_enabled(value);
905 self
906 }
907
908 /// Sets whether to write [`Statistics`] in the page header for a specific column.
909 ///
910 /// Takes precedence over [`Self::set_write_page_header_statistics`].
911 ///
912 /// [`Statistics`]: crate::file::statistics::Statistics
913 pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
914 self.get_mut_props(col)
915 .set_write_page_header_statistics(value);
916 self
917 }
918
919 /// Sets whether a bloom filter should be written for a specific column.
920 ///
921 /// Takes precedence over [`Self::set_bloom_filter_enabled`].
922 pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
923 self.get_mut_props(col).set_bloom_filter_enabled(value);
924 self
925 }
926
927 /// Sets the false positive probability for bloom filter for a specific column.
928 ///
929 /// Takes precedence over [`Self::set_bloom_filter_fpp`].
930 pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
931 self.get_mut_props(col).set_bloom_filter_fpp(value);
932 self
933 }
934
935 /// Sets the number of distinct values for bloom filter for a specific column.
936 ///
937 /// Takes precedence over [`Self::set_bloom_filter_ndv`].
938 pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
939 self.get_mut_props(col).set_bloom_filter_ndv(value);
940 self
941 }
942}
943
944impl From<WriterProperties> for WriterPropertiesBuilder {
945 fn from(props: WriterProperties) -> Self {
946 WriterPropertiesBuilder {
947 data_page_size_limit: props.data_page_size_limit,
948 data_page_row_count_limit: props.data_page_row_count_limit,
949 write_batch_size: props.write_batch_size,
950 max_row_group_size: props.max_row_group_size,
951 bloom_filter_position: props.bloom_filter_position,
952 writer_version: props.writer_version,
953 created_by: props.created_by,
954 offset_index_disabled: props.offset_index_disabled,
955 key_value_metadata: props.key_value_metadata,
956 default_column_properties: props.default_column_properties,
957 column_properties: props.column_properties,
958 sorting_columns: props.sorting_columns,
959 column_index_truncate_length: props.column_index_truncate_length,
960 statistics_truncate_length: props.statistics_truncate_length,
961 coerce_types: props.coerce_types,
962 #[cfg(feature = "encryption")]
963 file_encryption_properties: props.file_encryption_properties,
964 }
965 }
966}
967
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. In addition, this will enable the writing
    /// of the column index (the offset index is always written regardless of
    /// this setting). See [`ParquetColumnIndex`] for
    /// more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}

/// Parses a statistics level from its name.
///
/// Each level is accepted in all-uppercase or all-lowercase spelling only
/// (`"NONE"`/`"none"`, `"CHUNK"`/`"chunk"`, `"PAGE"`/`"page"`); any other input
/// yields an error message that echoes the rejected string.
impl FromStr for EnabledStatistics {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let level = match s {
            "NONE" | "none" => EnabledStatistics::None,
            "CHUNK" | "chunk" => EnabledStatistics::Chunk,
            "PAGE" | "page" => EnabledStatistics::Page,
            _ => return Err(format!("Invalid statistics arg: {s}")),
        };
        Ok(level)
    }
}
1011
/// The default statistics level is the crate-wide
/// [`DEFAULT_STATISTICS_ENABLED`] constant.
impl Default for EnabledStatistics {
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
1017
1018/// Controls the bloom filter to be computed by the writer.
1019#[derive(Debug, Clone, PartialEq)]
1020pub struct BloomFilterProperties {
1021 /// False positive probability. This should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
1022 ///
1023 /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
1024 ///
1025 /// The bloom filter data structure is a trade of between disk and memory space versus fpp, the
1026 /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
1027 /// e.g. 0.1, 0.05, or 0.001 is recommended.
1028 ///
1029 /// Setting to a very small number diminishes the value of the filter itself, as the bitset size is
1030 /// even larger than just storing the whole value. You are also expected to set `ndv` if it can
1031 /// be known in advance to greatly reduce space usage.
1032 pub fpp: f64,
1033 /// Number of distinct values, should be non-negative to be meaningful. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
1034 ///
1035 /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
1036 ///
1037 /// Usage of bloom filter is most beneficial for columns with large cardinality, so a good heuristic
1038 /// is to set ndv to the number of rows. However, it can reduce disk size if you know in advance a smaller
1039 /// number of distinct values. For very small ndv value it is probably not worth it to use bloom filter
1040 /// anyway.
1041 ///
1042 /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
1043 pub ndv: u64,
1044}
1045
1046impl Default for BloomFilterProperties {
1047 fn default() -> Self {
1048 BloomFilterProperties {
1049 fpp: DEFAULT_BLOOM_FILTER_FPP,
1050 ndv: DEFAULT_BLOOM_FILTER_NDV,
1051 }
1052 }
1053}
1054
1055/// Container for column properties that can be changed as part of writer.
1056///
1057/// If a field is `None`, it means that no specific value has been set for this column,
1058/// so some subsequent or default value must be used.
1059#[derive(Debug, Clone, Default, PartialEq)]
1060struct ColumnProperties {
1061 encoding: Option<Encoding>,
1062 codec: Option<Compression>,
1063 dictionary_page_size_limit: Option<usize>,
1064 dictionary_enabled: Option<bool>,
1065 statistics_enabled: Option<EnabledStatistics>,
1066 write_page_header_statistics: Option<bool>,
1067 /// bloom filter related properties
1068 bloom_filter_properties: Option<BloomFilterProperties>,
1069}
1070
1071impl ColumnProperties {
1072 /// Sets encoding for this column.
1073 ///
1074 /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1075 /// In case when dictionary is enabled for a column, this value is considered to
1076 /// be a fallback encoding.
1077 ///
1078 /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1079 /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1080 /// for a column.
1081 fn set_encoding(&mut self, value: Encoding) {
1082 if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1083 panic!("Dictionary encoding can not be used as fallback encoding");
1084 }
1085 self.encoding = Some(value);
1086 }
1087
1088 /// Sets compression codec for this column.
1089 fn set_compression(&mut self, value: Compression) {
1090 self.codec = Some(value);
1091 }
1092
1093 /// Sets whether dictionary encoding is enabled for this column.
1094 fn set_dictionary_enabled(&mut self, enabled: bool) {
1095 self.dictionary_enabled = Some(enabled);
1096 }
1097
1098 /// Sets dictionary page size limit for this column.
1099 fn set_dictionary_page_size_limit(&mut self, value: usize) {
1100 self.dictionary_page_size_limit = Some(value);
1101 }
1102
1103 /// Sets the statistics level for this column.
1104 fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1105 self.statistics_enabled = Some(enabled);
1106 }
1107
1108 /// Sets whether to write statistics in the page header for this column.
1109 fn set_write_page_header_statistics(&mut self, enabled: bool) {
1110 self.write_page_header_statistics = Some(enabled);
1111 }
1112
1113 /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1114 /// otherwise it is a no-op.
1115 /// If `value` is `false`, resets bloom filter properties to `None`.
1116 fn set_bloom_filter_enabled(&mut self, value: bool) {
1117 if value && self.bloom_filter_properties.is_none() {
1118 self.bloom_filter_properties = Some(Default::default())
1119 } else if !value {
1120 self.bloom_filter_properties = None
1121 }
1122 }
1123
1124 /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1125 /// bloom filter if not previously enabled.
1126 ///
1127 /// # Panics
1128 ///
1129 /// Panics if the `value` is not between 0 and 1 exclusive
1130 fn set_bloom_filter_fpp(&mut self, value: f64) {
1131 assert!(
1132 value > 0. && value < 1.0,
1133 "fpp must be between 0 and 1 exclusive, got {value}"
1134 );
1135
1136 self.bloom_filter_properties
1137 .get_or_insert_with(Default::default)
1138 .fpp = value;
1139 }
1140
1141 /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
1142 /// enables bloom filter if not previously enabled.
1143 fn set_bloom_filter_ndv(&mut self, value: u64) {
1144 self.bloom_filter_properties
1145 .get_or_insert_with(Default::default)
1146 .ndv = value;
1147 }
1148
1149 /// Returns optional encoding for this column.
1150 fn encoding(&self) -> Option<Encoding> {
1151 self.encoding
1152 }
1153
1154 /// Returns optional compression codec for this column.
1155 fn compression(&self) -> Option<Compression> {
1156 self.codec
1157 }
1158
1159 /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1160 /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1161 /// been provided.
1162 fn dictionary_enabled(&self) -> Option<bool> {
1163 self.dictionary_enabled
1164 }
1165
1166 /// Returns optional dictionary page size limit for this column.
1167 fn dictionary_page_size_limit(&self) -> Option<usize> {
1168 self.dictionary_page_size_limit
1169 }
1170
1171 /// Returns optional statistics level requested for this column. If result is `None`,
1172 /// then no setting has been provided.
1173 fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1174 self.statistics_enabled
1175 }
1176
1177 /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
1178 /// column.
1179 ///
1180 /// [`Statistics`]: crate::file::statistics::Statistics
1181 fn write_page_header_statistics(&self) -> Option<bool> {
1182 self.write_page_header_statistics
1183 }
1184
1185 /// Returns the bloom filter properties, or `None` if not enabled
1186 fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1187 self.bloom_filter_properties.as_ref()
1188 }
1189}
1190
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

/// Whether bloom filters are read when the option was never set on the
/// builder: `ReaderPropertiesBuilder::build` falls back to this value.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
1195
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    // Options handed to the compression codecs (built from `CodecOptionsBuilder`).
    codec_options: CodecOptions,
    // Whether bloom filters are read; `DEFAULT_READ_BLOOM_FILTER` unless set on the builder.
    read_bloom_filter: bool,
}

impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether to read bloom filter
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }
}
1235
1236/// Builder for parquet file reader configuration. See example on
1237/// [`ReaderProperties`]
1238pub struct ReaderPropertiesBuilder {
1239 codec_options_builder: CodecOptionsBuilder,
1240 read_bloom_filter: Option<bool>,
1241}
1242
1243/// Reader properties builder.
1244impl ReaderPropertiesBuilder {
1245 /// Returns default state of the builder.
1246 fn with_defaults() -> Self {
1247 Self {
1248 codec_options_builder: CodecOptionsBuilder::default(),
1249 read_bloom_filter: None,
1250 }
1251 }
1252
1253 /// Finalizes the configuration and returns immutable reader properties struct.
1254 pub fn build(self) -> ReaderProperties {
1255 ReaderProperties {
1256 codec_options: self.codec_options_builder.build(),
1257 read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1258 }
1259 }
1260
1261 /// Enable/disable backward compatible LZ4.
1262 ///
1263 /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1264 /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1265 /// with files generated by older versions of this library, and LZ4_RAW, for backward
1266 /// compatibility with files generated by older versions of parquet-cpp.
1267 ///
1268 /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1269 pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1270 self.codec_options_builder = self
1271 .codec_options_builder
1272 .set_backward_compatible_lz4(value);
1273 self
1274 }
1275
1276 /// Enable/disable reading bloom filter
1277 ///
1278 /// If reading bloom filter is enabled, bloom filter will be read from the file.
1279 /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1280 ///
1281 /// By default bloom filter is set to be read.
1282 pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1283 self.read_bloom_filter = Some(value);
1284 self
1285 }
1286}
1287
1288#[cfg(test)]
1289mod tests {
1290 use super::*;
1291
1292 #[test]
1293 fn test_writer_version() {
1294 assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
1295 assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
1296 }
1297
1298 #[test]
1299 fn test_writer_properties_default_settings() {
1300 let props = WriterProperties::default();
1301 assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
1302 assert_eq!(
1303 props.dictionary_page_size_limit(),
1304 DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
1305 );
1306 assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
1307 assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
1308 assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
1309 assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
1310 assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
1311 assert_eq!(props.key_value_metadata(), None);
1312 assert_eq!(props.encoding(&ColumnPath::from("col")), None);
1313 assert_eq!(
1314 props.compression(&ColumnPath::from("col")),
1315 DEFAULT_COMPRESSION
1316 );
1317 assert_eq!(
1318 props.dictionary_enabled(&ColumnPath::from("col")),
1319 DEFAULT_DICTIONARY_ENABLED
1320 );
1321 assert_eq!(
1322 props.statistics_enabled(&ColumnPath::from("col")),
1323 DEFAULT_STATISTICS_ENABLED
1324 );
1325 assert!(props
1326 .bloom_filter_properties(&ColumnPath::from("col"))
1327 .is_none());
1328 }
1329
1330 #[test]
1331 fn test_writer_properties_dictionary_encoding() {
1332 // dictionary encoding is not configurable, and it should be the same for both
1333 // writer version 1 and 2.
1334 for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
1335 let props = WriterProperties::builder()
1336 .set_writer_version(*version)
1337 .build();
1338 assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
1339 assert_eq!(
1340 props.dictionary_data_page_encoding(),
1341 Encoding::RLE_DICTIONARY
1342 );
1343 }
1344 }
1345
1346 #[test]
1347 #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1348 fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
1349 // Should panic when user specifies dictionary encoding as fallback encoding.
1350 WriterProperties::builder()
1351 .set_encoding(Encoding::PLAIN_DICTIONARY)
1352 .build();
1353 }
1354
1355 #[test]
1356 #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1357 fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
1358 // Should panic when user specifies dictionary encoding as fallback encoding.
1359 WriterProperties::builder()
1360 .set_encoding(Encoding::RLE_DICTIONARY)
1361 .build();
1362 }
1363
1364 #[test]
1365 #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1366 fn test_writer_properties_panic_when_dictionary_is_enabled() {
1367 WriterProperties::builder()
1368 .set_dictionary_enabled(true)
1369 .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
1370 .build();
1371 }
1372
1373 #[test]
1374 #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
1375 fn test_writer_properties_panic_when_dictionary_is_disabled() {
1376 WriterProperties::builder()
1377 .set_dictionary_enabled(false)
1378 .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
1379 .build();
1380 }
1381
1382 #[test]
1383 fn test_writer_properties_builder() {
1384 let props = WriterProperties::builder()
1385 // file settings
1386 .set_writer_version(WriterVersion::PARQUET_2_0)
1387 .set_data_page_size_limit(10)
1388 .set_dictionary_page_size_limit(20)
1389 .set_write_batch_size(30)
1390 .set_max_row_group_size(40)
1391 .set_created_by("default".to_owned())
1392 .set_key_value_metadata(Some(vec![KeyValue::new(
1393 "key".to_string(),
1394 "value".to_string(),
1395 )]))
1396 // global column settings
1397 .set_encoding(Encoding::DELTA_BINARY_PACKED)
1398 .set_compression(Compression::GZIP(Default::default()))
1399 .set_dictionary_enabled(false)
1400 .set_statistics_enabled(EnabledStatistics::None)
1401 // specific column settings
1402 .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
1403 .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
1404 .set_column_dictionary_enabled(ColumnPath::from("col"), true)
1405 .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
1406 .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
1407 .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
1408 .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
1409 .build();
1410
1411 fn test_props(props: &WriterProperties) {
1412 assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
1413 assert_eq!(props.data_page_size_limit(), 10);
1414 assert_eq!(props.dictionary_page_size_limit(), 20);
1415 assert_eq!(props.write_batch_size(), 30);
1416 assert_eq!(props.max_row_group_size(), 40);
1417 assert_eq!(props.created_by(), "default");
1418 assert_eq!(
1419 props.key_value_metadata(),
1420 Some(&vec![
1421 KeyValue::new("key".to_string(), "value".to_string(),)
1422 ])
1423 );
1424
1425 assert_eq!(
1426 props.encoding(&ColumnPath::from("a")),
1427 Some(Encoding::DELTA_BINARY_PACKED)
1428 );
1429 assert_eq!(
1430 props.compression(&ColumnPath::from("a")),
1431 Compression::GZIP(Default::default())
1432 );
1433 assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
1434 assert_eq!(
1435 props.statistics_enabled(&ColumnPath::from("a")),
1436 EnabledStatistics::None
1437 );
1438
1439 assert_eq!(
1440 props.encoding(&ColumnPath::from("col")),
1441 Some(Encoding::RLE)
1442 );
1443 assert_eq!(
1444 props.compression(&ColumnPath::from("col")),
1445 Compression::SNAPPY
1446 );
1447 assert!(props.dictionary_enabled(&ColumnPath::from("col")));
1448 assert_eq!(
1449 props.statistics_enabled(&ColumnPath::from("col")),
1450 EnabledStatistics::Chunk
1451 );
1452 assert_eq!(
1453 props.bloom_filter_properties(&ColumnPath::from("col")),
1454 Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
1455 );
1456 }
1457
1458 // Test direct build of properties
1459 test_props(&props);
1460
1461 // Test that into_builder() gives the same result
1462 let props_into_builder_and_back = props.into_builder().build();
1463 test_props(&props_into_builder_and_back);
1464 }
1465
1466 #[test]
1467 fn test_writer_properties_builder_partial_defaults() {
1468 let props = WriterProperties::builder()
1469 .set_encoding(Encoding::DELTA_BINARY_PACKED)
1470 .set_compression(Compression::GZIP(Default::default()))
1471 .set_bloom_filter_enabled(true)
1472 .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
1473 .build();
1474
1475 assert_eq!(
1476 props.encoding(&ColumnPath::from("col")),
1477 Some(Encoding::RLE)
1478 );
1479 assert_eq!(
1480 props.compression(&ColumnPath::from("col")),
1481 Compression::GZIP(Default::default())
1482 );
1483 assert_eq!(
1484 props.dictionary_enabled(&ColumnPath::from("col")),
1485 DEFAULT_DICTIONARY_ENABLED
1486 );
1487 assert_eq!(
1488 props.bloom_filter_properties(&ColumnPath::from("col")),
1489 Some(&BloomFilterProperties {
1490 fpp: 0.05,
1491 ndv: 1_000_000_u64
1492 })
1493 );
1494 }
1495
1496 #[test]
1497 fn test_writer_properties_bloom_filter_ndv_fpp_set() {
1498 assert_eq!(
1499 WriterProperties::builder()
1500 .build()
1501 .bloom_filter_properties(&ColumnPath::from("col")),
1502 None
1503 );
1504 assert_eq!(
1505 WriterProperties::builder()
1506 .set_bloom_filter_ndv(100)
1507 .build()
1508 .bloom_filter_properties(&ColumnPath::from("col")),
1509 Some(&BloomFilterProperties {
1510 fpp: 0.05,
1511 ndv: 100
1512 })
1513 );
1514 assert_eq!(
1515 WriterProperties::builder()
1516 .set_bloom_filter_fpp(0.1)
1517 .build()
1518 .bloom_filter_properties(&ColumnPath::from("col")),
1519 Some(&BloomFilterProperties {
1520 fpp: 0.1,
1521 ndv: 1_000_000_u64
1522 })
1523 );
1524 }
1525
1526 #[test]
1527 fn test_writer_properties_column_dictionary_page_size_limit() {
1528 let props = WriterProperties::builder()
1529 .set_dictionary_page_size_limit(100)
1530 .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
1531 .build();
1532
1533 assert_eq!(props.dictionary_page_size_limit(), 100);
1534 assert_eq!(
1535 props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
1536 10
1537 );
1538 assert_eq!(
1539 props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
1540 100
1541 );
1542 }
1543
1544 #[test]
1545 fn test_reader_properties_default_settings() {
1546 let props = ReaderProperties::builder().build();
1547
1548 let codec_options = CodecOptionsBuilder::default()
1549 .set_backward_compatible_lz4(true)
1550 .build();
1551
1552 assert_eq!(props.codec_options(), &codec_options);
1553 assert!(!props.read_bloom_filter());
1554 }
1555
1556 #[test]
1557 fn test_reader_properties_builder() {
1558 let props = ReaderProperties::builder()
1559 .set_backward_compatible_lz4(false)
1560 .build();
1561
1562 let codec_options = CodecOptionsBuilder::default()
1563 .set_backward_compatible_lz4(false)
1564 .build();
1565
1566 assert_eq!(props.codec_options(), &codec_options);
1567 }
1568
1569 #[test]
1570 fn test_parse_writerversion() {
1571 let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
1572 assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1573 writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
1574 assert_eq!(writer_version, WriterVersion::PARQUET_2_0);
1575
1576 // test lowercase
1577 writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
1578 assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
1579
1580 // test invalid version
1581 match "PARQUET_-1_0".parse::<WriterVersion>() {
1582 Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
1583 Err(e) => {
1584 assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
1585 }
1586 }
1587 }
1588
1589 #[test]
1590 fn test_parse_enabledstatistics() {
1591 let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
1592 assert_eq!(enabled_statistics, EnabledStatistics::None);
1593 enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
1594 assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
1595 enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
1596 assert_eq!(enabled_statistics, EnabledStatistics::Page);
1597
1598 // test lowercase
1599 enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
1600 assert_eq!(enabled_statistics, EnabledStatistics::None);
1601
1602 //test invalid statistics
1603 match "ChunkAndPage".parse::<EnabledStatistics>() {
1604 Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
1605 Err(e) => {
1606 assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
1607 }
1608 }
1609 }
1610}