parquet/file/properties.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
use crate::basic::{Compression, Encoding};
use crate::compression::{CodecOptions, CodecOptionsBuilder};
#[cfg(feature = "encryption")]
use crate::encryption::encrypt::FileEncryptionProperties;
use crate::file::metadata::KeyValue;
use crate::format::SortingColumn;
use crate::schema::types::ColumnPath;
use std::str::FromStr;
use std::{collections::HashMap, sync::Arc};

/// Default value for [`WriterProperties::data_page_size_limit`]
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_size`]
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;

/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
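///
/// # Example
///
/// A minimal sketch of parsing a version from a string via the [`FromStr`](std::str::FromStr)
/// implementation below; both upper- and lower-case spellings are accepted:
///
/// ```rust
/// # use parquet::file::properties::WriterVersion;
/// let version = "parquet_2_0".parse::<WriterVersion>().unwrap();
/// assert_eq!(version, WriterVersion::PARQUET_2_0);
/// assert_eq!(version.as_num(), 2);
/// ```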
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns writer version as `i32`.
    pub fn as_num(&self) -> i32 {
        match self {
            WriterVersion::PARQUET_1_0 => 1,
            WriterVersion::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "PARQUET_1_0" | "parquet_1_0" => Ok(WriterVersion::PARQUET_1_0),
            "PARQUET_2_0" | "parquet_2_0" => Ok(WriterVersion::PARQUET_2_0),
            _ => Err(format!("Invalid writer version: {s}")),
        }
    }
}

/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
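///
/// # Example
///
/// A minimal sketch of deferring Bloom Filter writing to the end of the file
/// (trading writer memory for reader data locality):
///
/// ```rust
/// # use parquet::file::properties::{BloomFilterPosition, WriterProperties};
/// let props = WriterProperties::builder()
///     .set_bloom_filter_position(BloomFilterPosition::End)
///     .build();
/// assert_eq!(props.bloom_filter_position(), BloomFilterPosition::End);
/// ```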
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}

/// Reference counted writer properties.
pub type WriterPropertiesPtr = Arc<WriterProperties>;

/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #     basic::{Compression, Encoding},
/// #     file::properties::*,
/// #     schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    data_page_size_limit: usize,
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_size: usize,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    offset_index_disabled: bool,
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<FileEncryptionProperties>,
}

impl Default for WriterProperties {
    fn default() -> Self {
        Self::builder().build()
    }
}

impl WriterProperties {
    /// Create a new [`WriterProperties`] with the default settings
    ///
    /// See [`WriterProperties::builder`] for customising settings
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
    /// properties.
    pub fn builder() -> WriterPropertiesBuilder {
        WriterPropertiesBuilder::default()
    }

    /// Returns data page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
    pub fn data_page_size_limit(&self) -> usize {
        self.data_page_size_limit
    }

    /// Returns dictionary page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
    pub fn dictionary_page_size_limit(&self) -> usize {
        self.default_column_properties
            .dictionary_page_size_limit()
            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
    }

    /// Returns dictionary page size limit for a specific column.
    pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
        self.column_properties
            .get(col)
            .and_then(|c| c.dictionary_page_size_limit())
            .or_else(|| self.default_column_properties.dictionary_page_size_limit())
            .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
    }

    /// Returns the maximum page row count
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
    pub fn data_page_row_count_limit(&self) -> usize {
        self.data_page_row_count_limit
    }

    /// Returns configured batch size for writes.
    ///
    /// When writing a batch of data, this setting allows splitting it internally into
    /// smaller batches so we can better estimate the size of a page currently being
    /// written.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
    pub fn write_batch_size(&self) -> usize {
        self.write_batch_size
    }

    /// Returns maximum number of rows in a row group.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
    pub fn max_row_group_size(&self) -> usize {
        self.max_row_group_size
    }

    /// Returns bloom filter position.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
        self.bloom_filter_position
    }

    /// Returns configured writer version.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
    pub fn writer_version(&self) -> WriterVersion {
        self.writer_version
    }

    /// Returns `created_by` string.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_created_by`]
    pub fn created_by(&self) -> &str {
        &self.created_by
    }

    /// Returns `true` if offset index writing is disabled.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
    pub fn offset_index_disabled(&self) -> bool {
        // If page statistics are to be collected, then do not disable the offset indexes.
        let default_page_stats_enabled =
            self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
        let column_page_stats_enabled = self
            .column_properties
            .iter()
            .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
        if default_page_stats_enabled || column_page_stats_enabled {
            return false;
        }

        self.offset_index_disabled
    }

    /// Returns `key_value_metadata` KeyValue pairs.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns sorting columns.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Returns the maximum length of truncated min/max values in the column index.
    ///
    /// `None` if truncation is disabled, must be greater than 0 otherwise.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
    pub fn column_index_truncate_length(&self) -> Option<usize> {
        self.column_index_truncate_length
    }

    /// Returns the maximum length of truncated min/max values in [`Statistics`].
    ///
    /// `None` if truncation is disabled, must be greater than 0 otherwise.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn statistics_truncate_length(&self) -> Option<usize> {
        self.statistics_truncate_length
    }

    /// Returns `true` if type coercion is enabled.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
    pub fn coerce_types(&self) -> bool {
        self.coerce_types
    }

    /// Returns encoding for a data page, when dictionary encoding is enabled.
    ///
    /// This is not configurable.
    #[inline]
    pub fn dictionary_data_page_encoding(&self) -> Encoding {
        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
        // Dictionary values are encoded using RLE_DICTIONARY encoding.
        Encoding::RLE_DICTIONARY
    }

    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
    ///
    /// This is not configurable.
    #[inline]
    pub fn dictionary_page_encoding(&self) -> Encoding {
        // PLAIN_DICTIONARY is deprecated in writer version 1.
        // Dictionary is encoded using plain encoding.
        Encoding::PLAIN
    }

    /// Returns encoding for a column, if set.
    ///
    /// If dictionary encoding is enabled for the column, this returns the fallback encoding.
    ///
    /// If the encoding is not set, the column writer will choose the best encoding
    /// based on the column type.
    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
        self.column_properties
            .get(col)
            .and_then(|c| c.encoding())
            .or_else(|| self.default_column_properties.encoding())
    }

    /// Returns compression codec for a column.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
    pub fn compression(&self, col: &ColumnPath) -> Compression {
        self.column_properties
            .get(col)
            .and_then(|c| c.compression())
            .or_else(|| self.default_column_properties.compression())
            .unwrap_or(DEFAULT_COMPRESSION)
    }

    /// Returns `true` if dictionary encoding is enabled for a column.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
        self.column_properties
            .get(col)
            .and_then(|c| c.dictionary_enabled())
            .or_else(|| self.default_column_properties.dictionary_enabled())
            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
    }

    /// Returns which statistics are written for a column.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
        self.column_properties
            .get(col)
            .and_then(|c| c.statistics_enabled())
            .or_else(|| self.default_column_properties.statistics_enabled())
            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
    }

    /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
        self.column_properties
            .get(col)
            .and_then(|c| c.write_page_header_statistics())
            .or_else(|| {
                self.default_column_properties
                    .write_page_header_statistics()
            })
            .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
    }

    /// Returns the [`BloomFilterProperties`] for the given column
    ///
    /// Returns `None` if bloom filter is disabled
    ///
    /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
        self.column_properties
            .get(col)
            .and_then(|c| c.bloom_filter_properties())
            .or_else(|| self.default_column_properties.bloom_filter_properties())
    }

    /// Return file encryption properties
    ///
    /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
    #[cfg(feature = "encryption")]
    pub fn file_encryption_properties(&self) -> Option<&FileEncryptionProperties> {
        self.file_encryption_properties.as_ref()
    }
}

/// Builder for [`WriterProperties`] Parquet writer configuration.
///
/// See example on [`WriterProperties`]
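///
/// # Example: global vs. per-column settings
///
/// A minimal sketch showing that the per-column setters take precedence over the
/// corresponding global setters (the column names are illustrative):
///
/// ```rust
/// # use parquet::{basic::Compression, file::properties::WriterProperties, schema::types::ColumnPath};
/// let props = WriterProperties::builder()
///     .set_compression(Compression::SNAPPY)
///     .set_column_compression(ColumnPath::from("col1"), Compression::UNCOMPRESSED)
///     .build();
/// // "col1" uses its explicit override, every other column the global default
/// assert_eq!(
///     props.compression(&ColumnPath::from("col1")),
///     Compression::UNCOMPRESSED
/// );
/// assert_eq!(props.compression(&ColumnPath::from("col2")), Compression::SNAPPY);
/// ```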
pub struct WriterPropertiesBuilder {
    data_page_size_limit: usize,
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_size: usize,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    offset_index_disabled: bool,
    key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<FileEncryptionProperties>,
}

impl Default for WriterPropertiesBuilder {
    /// Returns default state of the builder.
    fn default() -> Self {
        Self {
            data_page_size_limit: DEFAULT_PAGE_SIZE,
            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
            max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
            writer_version: DEFAULT_WRITER_VERSION,
            created_by: DEFAULT_CREATED_BY.to_string(),
            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
            key_value_metadata: None,
            default_column_properties: Default::default(),
            column_properties: HashMap::new(),
            sorting_columns: None,
            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
            coerce_types: DEFAULT_COERCE_TYPES,
            #[cfg(feature = "encryption")]
            file_encryption_properties: None,
        }
    }
}

impl WriterPropertiesBuilder {
    /// Finalizes the configuration and returns immutable writer properties struct.
    pub fn build(self) -> WriterProperties {
        WriterProperties {
            data_page_size_limit: self.data_page_size_limit,
            data_page_row_count_limit: self.data_page_row_count_limit,
            write_batch_size: self.write_batch_size,
            max_row_group_size: self.max_row_group_size,
            bloom_filter_position: self.bloom_filter_position,
            writer_version: self.writer_version,
            created_by: self.created_by,
            offset_index_disabled: self.offset_index_disabled,
            key_value_metadata: self.key_value_metadata,
            default_column_properties: self.default_column_properties,
            column_properties: self.column_properties,
            sorting_columns: self.sorting_columns,
            column_index_truncate_length: self.column_index_truncate_length,
            statistics_truncate_length: self.statistics_truncate_length,
            coerce_types: self.coerce_types,
            #[cfg(feature = "encryption")]
            file_encryption_properties: self.file_encryption_properties,
        }
    }

    // ----------------------------------------------------------------------
    // Writer properties related to a file

    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
    /// via [`DEFAULT_WRITER_VERSION`])
    ///
    /// This value can determine what features some readers will support.
    ///
    /// [`PARQUET_1_0`]: WriterVersion::PARQUET_1_0
    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
        self.writer_version = value;
        self
    }

    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
    /// via [`DEFAULT_PAGE_SIZE`]).
    ///
    /// The parquet writer will attempt to limit the sizes of each
    /// `DataPage` to this many bytes. Reducing this value will result
    /// in larger parquet files, but may improve the effectiveness of
    /// page index based predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
        self.data_page_size_limit = value;
        self
    }

    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
    /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
    ///
    /// The parquet writer will attempt to limit the number of rows in
    /// each `DataPage` to this value. Reducing this value will result
    /// in larger parquet files, but may improve the effectiveness of
    /// page index based predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
        self.data_page_row_count_limit = value;
        self
    }

    /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
    ///
    /// For performance reasons, data for each column is written in
    /// batches of this size.
    ///
    /// Additional limits such as
    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
    /// are checked between batches, and thus the write batch size value acts as an
    /// upper-bound on the enforcement granularity of other limits.
    pub fn set_write_batch_size(mut self, value: usize) -> Self {
        self.write_batch_size = value;
        self
    }

    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
    /// via [`DEFAULT_MAX_ROW_GROUP_SIZE`]).
    ///
    /// # Panics
    /// If the value is set to 0.
    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
        assert!(value > 0, "Cannot have a 0 max row group size");
        self.max_row_group_size = value;
        self
    }

    /// Sets where in the final file Bloom Filters are written (defaults to [`AfterRowGroup`]
    /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
    ///
    /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
        self.bloom_filter_position = value;
        self
    }

    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
    /// [`DEFAULT_CREATED_BY`]).
    ///
    /// This is a string that will be written into the file metadata
    pub fn set_created_by(mut self, value: String) -> Self {
        self.created_by = value;
        self
    }

    /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
    /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
    ///
    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
    ///
    /// Note: As the offset indexes are useful for accessing data by row number,
    /// they are always written by default, regardless of whether other statistics
    /// are enabled. Disabling this metadata may result in a degradation in read
    /// performance, so use this option with care.
    ///
    /// [`Page`]: EnabledStatistics::Page
    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
        self.offset_index_disabled = value;
        self
    }

    /// Sets "key_value_metadata" property (defaults to `None`).
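    ///
    /// A minimal sketch of attaching application-defined metadata to the file footer;
    /// the key and value shown are arbitrary:
    ///
    /// ```rust
    /// # use parquet::file::{metadata::KeyValue, properties::WriterProperties};
    /// let props = WriterProperties::builder()
    ///     .set_key_value_metadata(Some(vec![KeyValue::new(
    ///         "application".to_string(),
    ///         "example-writer".to_string(),
    ///     )]))
    ///     .build();
    /// assert_eq!(props.key_value_metadata().unwrap().len(), 1);
    /// ```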
    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
        self.key_value_metadata = value;
        self
    }

    /// Sets sorting order of rows in the row group if any (defaults to `None`).
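    ///
    /// A minimal sketch, assuming the rows are already sorted by the first leaf column
    /// (ascending, nulls first). This only records the sort order in the metadata; it
    /// does not sort the data:
    ///
    /// ```rust
    /// # use parquet::{file::properties::WriterProperties, format::SortingColumn};
    /// // `column_idx` is the leaf column index; the values here are illustrative
    /// let sorting = SortingColumn {
    ///     column_idx: 0,
    ///     descending: false,
    ///     nulls_first: true,
    /// };
    /// let props = WriterProperties::builder()
    ///     .set_sorting_columns(Some(vec![sorting]))
    ///     .build();
    /// assert_eq!(props.sorting_columns().map(|s| s.len()), Some(1));
    /// ```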
    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
        self.sorting_columns = value;
        self
    }

    /// Sets the max length of min/max value fields when writing the column
    /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
    ///
    /// This can be used to prevent columns with very long values (hundreds of
    /// bytes long) from causing the parquet metadata to become huge.
    ///
    /// # Notes
    ///
    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
    /// set to [`EnabledStatistics::Page`].
    ///
    /// * If `Some`, must be greater than 0, otherwise will panic
    /// * If `None`, there's no effective limit.
    ///
    /// [`Index`]: crate::file::page_index::index::Index
    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
        if let Some(value) = max_length {
            assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`.");
        }

        self.column_index_truncate_length = max_length;
        self
    }

    /// Sets the max length of min/max value fields in row group and data page header
    /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
    ///
    /// # Notes
    /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
    /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
    /// [`EnabledStatistics::Page`].
    ///
    /// * If `Some`, must be greater than 0, otherwise will panic
    /// * If `None`, there's no effective limit.
    ///
    /// # See also
    /// Truncation of Page Index statistics is controlled separately via
    /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
        if let Some(value) = max_length {
            assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
        }

        self.statistics_truncate_length = max_length;
        self
    }

    /// Should the writer coerce types to parquet native types (defaults to `false` via
    /// [`DEFAULT_COERCE_TYPES`]).
    ///
    /// Leaving this option as the default `false` ensures that the exact same data
    /// written to parquet using this library will be read back.
    ///
    /// Setting this option to `true` will result in parquet files that can be
    /// read by more readers, but potentially lose information in the process.
    ///
    /// * Types such as [`DataType::Date64`], which have no direct corresponding
    ///   Parquet type, may be stored with lower precision.
    ///
    /// * The internal field names of `List` and `Map` types will be renamed if
    ///   necessary to match what is required by the newest Parquet specification.
    ///
    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
    ///
    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
        self.coerce_types = coerce_types;
        self
    }

    /// Sets FileEncryptionProperties (defaults to `None`)
    #[cfg(feature = "encryption")]
    pub fn with_file_encryption_properties(
        mut self,
        file_encryption_properties: FileEncryptionProperties,
    ) -> Self {
        self.file_encryption_properties = Some(file_encryption_properties);
        self
    }

    // ----------------------------------------------------------------------
    // Setters for any column (global)

    /// Sets default encoding for all columns.
    ///
    /// If dictionary encoding is not enabled, this is treated as the primary encoding for all
    /// columns. If dictionary encoding is enabled for a column, this value is
    /// considered to be the fallback encoding for that column.
    ///
    /// # Panics
    ///
    /// If a dictionary encoding (`PLAIN_DICTIONARY` or `RLE_DICTIONARY`) is specified,
    /// regardless of whether the dictionary encoding flag is set.
    pub fn set_encoding(mut self, value: Encoding) -> Self {
        self.default_column_properties.set_encoding(value);
        self
    }

    /// Sets default compression codec for all columns (defaults to [`UNCOMPRESSED`] via
    /// [`DEFAULT_COMPRESSION`]).
    ///
    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.default_column_properties.set_compression(value);
        self
    }

    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
    /// via [`DEFAULT_DICTIONARY_ENABLED`]).
    ///
    /// Use this method to enable or disable dictionary encoding, instead of explicitly
    /// specifying a dictionary encoding via the `set_encoding` method.
    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
        self.default_column_properties.set_dictionary_enabled(value);
        self
    }

    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
    /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
    ///
    /// The parquet writer will attempt to limit the size of each
    /// `DataPage` used to store dictionaries to this many
    /// bytes. Reducing this value will result in larger parquet
    /// files, but may improve the effectiveness of page index based
    /// predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
        self.default_column_properties
            .set_dictionary_page_size_limit(value);
        self
    }

    /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
    /// [`DEFAULT_STATISTICS_ENABLED`]).
    ///
    /// [`Page`]: EnabledStatistics::Page
    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
        self.default_column_properties.set_statistics_enabled(value);
        self
    }

    /// Enable/disable writing [`Statistics`] in the page header
    /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
    ///
    /// Only applicable if [`Page`] level statistics are gathered.
    ///
    /// Setting this value to `true` can greatly increase the size of the resulting Parquet
    /// file while yielding very little added benefit. Most modern Parquet implementations
    /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
    /// those in the page header.
    ///
    /// # Note
    ///
    /// Prior to version 56.0.0, the `parquet` crate always wrote these
    /// statistics (the equivalent of setting this option to `true`). This was
    /// changed in 56.0.0 to follow the recommendation in the Parquet
    /// specification. See [issue #7580] for more details.
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    /// [`Page`]: EnabledStatistics::Page
    /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
    pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
        self.default_column_properties
            .set_write_page_header_statistics(value);
        self
    }

    /// Sets if bloom filter should be written for all columns (defaults to `false`).
    ///
    /// # Notes
    ///
    /// * If the bloom filter was previously enabled, this is a no-op.
    ///
    /// * If the bloom filter is not yet enabled, default values for ndv and fpp
    ///   are used. See [`set_bloom_filter_ndv`] and [`set_bloom_filter_fpp`] to
    ///   further adjust the ndv and fpp.
805 ///
806 /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
807 /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
808 pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
809 self.default_column_properties
810 .set_bloom_filter_enabled(value);
811 self
812 }
813
814 /// Sets the default target bloom filter false positive probability (fpp)
815 /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
816 ///
817 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
818 /// been called.
819 ///
820 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
821 pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
822 self.default_column_properties.set_bloom_filter_fpp(value);
823 self
824 }
825
826 /// Sets default number of distinct values (ndv) for bloom filter for all
827 /// columns (defaults to `1_000_000` via [`DEFAULT_BLOOM_FILTER_NDV`]).
828 ///
829 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
830 /// been called.
831 ///
832 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
833 pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
834 self.default_column_properties.set_bloom_filter_ndv(value);
835 self
836 }
837
838 // ----------------------------------------------------------------------
839 // Setters for a specific column
840
841 /// Helper method to get existing or new mutable reference of column properties.
842 #[inline]
843 fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
844 self.column_properties.entry(col).or_default()
845 }
846
847 /// Sets encoding for a specific column.
848 ///
849 /// Takes precedence over [`Self::set_encoding`].
850 ///
    /// If dictionary encoding is not enabled, this is treated as the primary encoding for this
    /// column. If dictionary encoding is enabled for this column, either through
    /// global defaults or explicitly, this value is considered to be the fallback
    /// encoding for this column.
    ///
    /// # Panics
    /// If the user tries to set a dictionary encoding here, regardless of whether the
    /// dictionary encoding flag is set.
    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
        self.get_mut_props(col).set_encoding(value);
        self
    }

    /// Sets compression codec for a specific column.
    ///
    /// Takes precedence over [`Self::set_compression`].
    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
        self.get_mut_props(col).set_compression(value);
        self
    }

    /// Sets flag to enable/disable dictionary encoding for a specific column.
    ///
    /// Takes precedence over [`Self::set_dictionary_enabled`].
    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
        self.get_mut_props(col).set_dictionary_enabled(value);
        self
    }

    /// Sets dictionary page size limit for a specific column.
    ///
    /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
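    ///
    /// A minimal sketch, mirroring the per-column override pattern used by the other
    /// setters (the sizes are illustrative):
    ///
    /// ```rust
    /// # use parquet::{file::properties::WriterProperties, schema::types::ColumnPath};
    /// let props = WriterProperties::builder()
    ///     .set_dictionary_page_size_limit(1024 * 1024)
    ///     .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 64 * 1024)
    ///     .build();
    /// assert_eq!(
    ///     props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
    ///     64 * 1024
    /// );
    /// assert_eq!(
    ///     props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
    ///     1024 * 1024
    /// );
    /// ```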
    pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
        self.get_mut_props(col)
            .set_dictionary_page_size_limit(value);
        self
    }

    /// Sets [`EnabledStatistics`] level for a specific column.
    ///
    /// Takes precedence over [`Self::set_statistics_enabled`].
    pub fn set_column_statistics_enabled(
        mut self,
        col: ColumnPath,
        value: EnabledStatistics,
    ) -> Self {
        self.get_mut_props(col).set_statistics_enabled(value);
        self
    }

    /// Sets whether to write [`Statistics`] in the page header for a specific column.
    ///
    /// Takes precedence over [`Self::set_write_page_header_statistics`].
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
        self.get_mut_props(col)
            .set_write_page_header_statistics(value);
        self
    }

    /// Sets whether a bloom filter should be written for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
        self.get_mut_props(col).set_bloom_filter_enabled(value);
        self
    }

    /// Sets the false positive probability for bloom filter for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
        self.get_mut_props(col).set_bloom_filter_fpp(value);
        self
    }

    /// Sets the number of distinct values for bloom filter for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
        self.get_mut_props(col).set_bloom_filter_ndv(value);
        self
    }
}

/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
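///
/// # Example
///
/// A minimal sketch of disabling statistics globally while keeping page-level
/// statistics for a single column (the column name is illustrative):
///
/// ```rust
/// # use parquet::{
/// #     file::properties::{EnabledStatistics, WriterProperties},
/// #     schema::types::ColumnPath,
/// # };
/// let props = WriterProperties::builder()
///     .set_statistics_enabled(EnabledStatistics::None)
///     .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Page)
///     .build();
/// assert_eq!(
///     props.statistics_enabled(&ColumnPath::from("col")),
///     EnabledStatistics::Page
/// );
/// assert_eq!(
///     props.statistics_enabled(&ColumnPath::from("other")),
///     EnabledStatistics::None
/// );
/// ```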
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. In addition, this will enable the writing
    /// of the column index (the offset index is always written regardless of
    /// this setting). See [`ParquetColumnIndex`] for
    /// more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}

impl FromStr for EnabledStatistics {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "NONE" | "none" => Ok(EnabledStatistics::None),
            "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk),
            "PAGE" | "page" => Ok(EnabledStatistics::Page),
            _ => Err(format!("Invalid statistics arg: {s}")),
        }
    }
}

impl Default for EnabledStatistics {
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}

/// Controls the bloom filter to be computed by the writer.
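///
/// A minimal sketch: setting either `fpp` or `ndv` through the builder implicitly
/// enables the bloom filter for that column (the values are illustrative):
///
/// ```rust
/// # use parquet::{
/// #     file::properties::{BloomFilterProperties, WriterProperties},
/// #     schema::types::ColumnPath,
/// # };
/// let props = WriterProperties::builder()
///     .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.01)
///     .set_column_bloom_filter_ndv(ColumnPath::from("col"), 10_000)
///     .build();
/// assert_eq!(
///     props.bloom_filter_properties(&ColumnPath::from("col")),
///     Some(&BloomFilterProperties { fpp: 0.01, ndv: 10_000 })
/// );
/// ```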
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability. This should always be between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp: the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// Setting it to a very small number diminishes the value of the filter itself, as the bitset size is
    /// even larger than just storing the whole value. You are also expected to set `ndv` if it can
    /// be known in advance in order to greatly reduce space usage.
    pub fpp: f64,
    /// Number of distinct values, should be non-negative to be meaningful. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Usage of a bloom filter is most beneficial for columns with large cardinality, so a good heuristic
    /// is to set ndv to the number of rows. However, it can reduce disk size if you know in advance a smaller
    /// number of distinct values. For a very small ndv it is probably not worth using a bloom filter
    /// at all.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
    pub ndv: u64,
}

impl Default for BloomFilterProperties {
    fn default() -> Self {
        BloomFilterProperties {
            fpp: DEFAULT_BLOOM_FILTER_FPP,
            ndv: DEFAULT_BLOOM_FILTER_NDV,
        }
    }
}

/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    encoding: Option<Encoding>,
    codec: Option<Compression>,
    dictionary_page_size_limit: Option<usize>,
    dictionary_enabled: Option<bool>,
    statistics_enabled: Option<EnabledStatistics>,
    write_page_header_statistics: Option<bool>,
    /// bloom filter related properties
    bloom_filter_properties: Option<BloomFilterProperties>,
}

impl ColumnProperties {
    /// Sets encoding for this column.
    ///
    /// If dictionary encoding is not enabled, this is treated as the primary encoding for a column.
    /// If dictionary encoding is enabled for a column, this value is considered to
    /// be the fallback encoding.
    ///
    /// Panics if the user tries to set a dictionary encoding here, regardless of whether the
    /// dictionary encoding flag is set. Use the `set_dictionary_enabled` method to enable
    /// dictionary encoding for a column.
    fn set_encoding(&mut self, value: Encoding) {
        if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
            panic!("Dictionary encoding can not be used as fallback encoding");
        }
        self.encoding = Some(value);
    }

    /// Sets compression codec for this column.
    fn set_compression(&mut self, value: Compression) {
        self.codec = Some(value);
    }

    /// Sets whether dictionary encoding is enabled for this column.
    fn set_dictionary_enabled(&mut self, enabled: bool) {
        self.dictionary_enabled = Some(enabled);
    }

    /// Sets dictionary page size limit for this column.
    fn set_dictionary_page_size_limit(&mut self, value: usize) {
        self.dictionary_page_size_limit = Some(value);
    }

    /// Sets the statistics level for this column.
    fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
        self.statistics_enabled = Some(enabled);
    }

    /// Sets whether to write statistics in the page header for this column.
    fn set_write_page_header_statistics(&mut self, enabled: bool) {
        self.write_page_header_statistics = Some(enabled);
    }

    /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
    /// otherwise it is a no-op.
    /// If `value` is `false`, resets bloom filter properties to `None`.
    fn set_bloom_filter_enabled(&mut self, value: bool) {
        if value && self.bloom_filter_properties.is_none() {
            self.bloom_filter_properties = Some(Default::default())
        } else if !value {
            self.bloom_filter_properties = None
        }
    }

    /// Sets the false positive probability for bloom filter for this column, and implicitly enables
    /// bloom filter if not previously enabled.
    ///
    /// # Panics
    ///
    /// Panics if the `value` is not between 0 and 1 exclusive
    fn set_bloom_filter_fpp(&mut self, value: f64) {
        assert!(
            value > 0. && value < 1.0,
            "fpp must be between 0 and 1 exclusive, got {value}"
        );

        self.bloom_filter_properties
            .get_or_insert_with(Default::default)
            .fpp = value;
    }

    /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
    /// enables bloom filter if not previously enabled.
    fn set_bloom_filter_ndv(&mut self, value: u64) {
        self.bloom_filter_properties
            .get_or_insert_with(Default::default)
            .ndv = value;
    }

    /// Returns optional encoding for this column.
    fn encoding(&self) -> Option<Encoding> {
        self.encoding
    }

    /// Returns optional compression codec for this column.
    fn compression(&self) -> Option<Compression> {
        self.codec
    }

    /// Returns `Some(true)` if dictionary encoding is enabled for this column and
    /// `Some(false)` if it is disabled. If the result is `None`, no setting has
    /// been provided.
    fn dictionary_enabled(&self) -> Option<bool> {
        self.dictionary_enabled
    }

    /// Returns optional dictionary page size limit for this column.
    fn dictionary_page_size_limit(&self) -> Option<usize> {
        self.dictionary_page_size_limit
    }

    /// Returns optional statistics level requested for this column. If result is `None`,
    /// then no setting has been provided.
    fn statistics_enabled(&self) -> Option<EnabledStatistics> {
        self.statistics_enabled
    }

    /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
    /// column.
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    fn write_page_header_statistics(&self) -> Option<bool> {
        self.write_page_header_statistics
    }

    /// Returns the bloom filter properties, or `None` if not enabled
    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
        self.bloom_filter_properties.as_ref()
    }
}

/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

const DEFAULT_READ_BLOOM_FILTER: bool = false;

/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    codec_options: CodecOptions,
    read_bloom_filter: bool,
}

impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether to read bloom filter
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }
}

/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    codec_options_builder: CodecOptionsBuilder,
    read_bloom_filter: Option<bool>,
}

/// Reader properties builder.
impl ReaderPropertiesBuilder {
    /// Returns default state of the builder.
    fn with_defaults() -> Self {
        Self {
            codec_options_builder: CodecOptionsBuilder::default(),
            read_bloom_filter: None,
        }
    }

    /// Finalizes the configuration and returns immutable reader properties struct.
    pub fn build(self) -> ReaderProperties {
        ReaderProperties {
            codec_options: self.codec_options_builder.build(),
            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
        }
    }

    /// Enable/disable backward compatible LZ4.
    ///
    /// If backward compatible LZ4 is enabled, on an LZ4_HADOOP error it will fall back
    /// to older LZ4 algorithms: LZ4_FRAME, for backward compatibility
    /// with files generated by older versions of this library, and LZ4_RAW, for backward
    /// compatibility with files generated by older versions of parquet-cpp.
    ///
    /// If backward compatible LZ4 is disabled, on an LZ4_HADOOP error it will return the error.
    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
        self.codec_options_builder = self
            .codec_options_builder
            .set_backward_compatible_lz4(value);
        self
    }

    /// Enable/disable reading bloom filter
    ///
    /// If reading bloom filter is enabled, the bloom filter will be read from the file.
    /// If reading bloom filter is disabled, the bloom filter will not be read from the file.
    ///
    /// By default, reading bloom filters is disabled.
    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
        self.read_bloom_filter = Some(value);
        self
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        assert!(props
            .bloom_filter_properties(&ColumnPath::from("col"))
            .is_none());
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_size(40)
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
        assert_eq!(props.data_page_size_limit(), 10);
        assert_eq!(props.dictionary_page_size_limit(), 20);
        assert_eq!(props.write_batch_size(), 30);
        assert_eq!(props.max_row_group_size(), 40);
        assert_eq!(props.created_by(), "default");
        assert_eq!(
            props.key_value_metadata(),
            Some(&vec![
                KeyValue::new("key".to_string(), "value".to_string(),)
            ])
        );

        assert_eq!(
            props.encoding(&ColumnPath::from("a")),
            Some(Encoding::DELTA_BINARY_PACKED)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("a")),
            Compression::GZIP(Default::default())
        );
        assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("a")),
            EnabledStatistics::None
        );

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::SNAPPY
        );
        assert!(props.dictionary_enabled(&ColumnPath::from("col")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            EnabledStatistics::Chunk
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
        );
    }

    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 100
            })
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_writer_properties_column_dictionary_page_size_limit() {
        let props = WriterProperties::builder()
            .set_dictionary_page_size_limit(100)
            .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
            .build();

        assert_eq!(props.dictionary_page_size_limit(), 100);
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
            10
        );
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
            100
        );
    }

    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(!props.read_bloom_filter());
    }

    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
    }

    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);

        // test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }
}
1570}