parquet/file/properties.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::{KeyValue, SortingColumn};
24use crate::schema::types::ColumnPath;
25use std::str::FromStr;
26use std::{collections::HashMap, sync::Arc};
27
/// Default value for [`WriterProperties::data_page_size_limit`] (in bytes)
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
/// (in bytes, same as [`DEFAULT_PAGE_SIZE`])
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`] (in rows)
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::write_page_header_statistics`]
pub const DEFAULT_WRITE_PAGE_HEADER_STATISTICS: bool = false;
/// Default value for [`WriterProperties::max_row_group_size`] (in rows)
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
///
/// The crate version is taken from `CARGO_PKG_VERSION` at compile time.
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
64
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
///
/// A version can be parsed from its variant name through [`FromStr`]; the
/// comparison ignores ASCII case, so `"PARQUET_1_0"`, `"parquet_1_0"` and
/// `"Parquet_1_0"` all yield [`WriterVersion::PARQUET_1_0`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns writer version as `i32`: `1` for [`WriterVersion::PARQUET_1_0`]
    /// and `2` for [`WriterVersion::PARQUET_2_0`].
    pub fn as_num(&self) -> i32 {
        match self {
            WriterVersion::PARQUET_1_0 => 1,
            WriterVersion::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a writer version from its variant name, ignoring ASCII case.
    ///
    /// # Errors
    ///
    /// Returns `Invalid writer version: <input>` when `s` names neither
    /// `PARQUET_1_0` nor `PARQUET_2_0`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Previously only the all-upper and all-lower spellings were accepted;
        // comparing case-insensitively keeps those and also accepts mixed case.
        if s.eq_ignore_ascii_case("PARQUET_1_0") {
            Ok(WriterVersion::PARQUET_1_0)
        } else if s.eq_ignore_ascii_case("PARQUET_2_0") {
            Ok(WriterVersion::PARQUET_2_0)
        } else {
            Err(format!("Invalid writer version: {s}"))
        }
    }
}
98
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The two variants trade writer memory usage against data locality for readers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
116
/// Reference counted writer properties: an [`Arc`] around [`WriterProperties`].
pub type WriterPropertiesPtr = Arc<WriterProperties>;
119
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// Settings that can differ per column are looked up in the per-column
/// overrides first and fall back to the defaults shared by all columns.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    /// Best effort limit on the size of a data page, in bytes.
    data_page_size_limit: usize,
    /// Best effort limit on the number of rows in a data page.
    data_page_row_count_limit: usize,
    /// Size of the batches in which column data is written.
    write_batch_size: usize,
    /// Maximum number of rows in a row group.
    max_row_group_size: usize,
    /// Where in the file Bloom filters are written.
    bloom_filter_position: BloomFilterPosition,
    /// Writer version recorded for the file.
    writer_version: WriterVersion,
    /// `created_by` string.
    created_by: String,
    /// Requested opt-out of offset index writing; the effective value is
    /// computed by [`Self::offset_index_disabled`].
    offset_index_disabled: bool,
    /// Optional key/value metadata pairs.
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings used when a column has no override of its own.
    default_column_properties: ColumnProperties,
    /// Per-column overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Sorting columns, if any.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Maximum length of truncated min/max values in the column index;
    /// `None` disables truncation.
    column_index_truncate_length: Option<usize>,
    /// Maximum length of truncated min/max values in statistics;
    /// `None` disables truncation.
    statistics_truncate_length: Option<usize>,
    /// Whether type coercion is enabled.
    coerce_types: bool,
    /// File encryption properties, if set (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
174
175impl Default for WriterProperties {
176 fn default() -> Self {
177 Self::builder().build()
178 }
179}
180
181impl WriterProperties {
182 /// Create a new [`WriterProperties`] with the default settings
183 ///
184 /// See [`WriterProperties::builder`] for customising settings
185 pub fn new() -> Self {
186 Self::default()
187 }
188
189 /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
190 /// properties.
191 pub fn builder() -> WriterPropertiesBuilder {
192 WriterPropertiesBuilder::default()
193 }
194
195 /// Converts this [`WriterProperties`] into a [`WriterPropertiesBuilder`]
196 /// Used for mutating existing property settings
197 pub fn into_builder(self) -> WriterPropertiesBuilder {
198 self.into()
199 }
200
201 /// Returns data page size limit.
202 ///
203 /// Note: this is a best effort limit based on the write batch size
204 ///
205 /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
206 pub fn data_page_size_limit(&self) -> usize {
207 self.data_page_size_limit
208 }
209
210 /// Returns dictionary page size limit.
211 ///
212 /// Note: this is a best effort limit based on the write batch size
213 ///
214 /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
215 pub fn dictionary_page_size_limit(&self) -> usize {
216 self.default_column_properties
217 .dictionary_page_size_limit()
218 .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
219 }
220
221 /// Returns dictionary page size limit for a specific column.
222 pub fn column_dictionary_page_size_limit(&self, col: &ColumnPath) -> usize {
223 self.column_properties
224 .get(col)
225 .and_then(|c| c.dictionary_page_size_limit())
226 .or_else(|| self.default_column_properties.dictionary_page_size_limit())
227 .unwrap_or(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT)
228 }
229
230 /// Returns the maximum page row count
231 ///
232 /// Note: this is a best effort limit based on the write batch size
233 ///
234 /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
235 pub fn data_page_row_count_limit(&self) -> usize {
236 self.data_page_row_count_limit
237 }
238
239 /// Returns configured batch size for writes.
240 ///
241 /// When writing a batch of data, this setting allows to split it internally into
242 /// smaller batches so we can better estimate the size of a page currently being
243 /// written.
244 ///
245 /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
246 pub fn write_batch_size(&self) -> usize {
247 self.write_batch_size
248 }
249
250 /// Returns maximum number of rows in a row group.
251 ///
252 /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
253 pub fn max_row_group_size(&self) -> usize {
254 self.max_row_group_size
255 }
256
257 /// Returns bloom filter position.
258 ///
259 /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
260 pub fn bloom_filter_position(&self) -> BloomFilterPosition {
261 self.bloom_filter_position
262 }
263
264 /// Returns configured writer version.
265 ///
266 /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
267 pub fn writer_version(&self) -> WriterVersion {
268 self.writer_version
269 }
270
271 /// Returns `created_by` string.
272 ///
273 /// For more details see [`WriterPropertiesBuilder::set_created_by`]
274 pub fn created_by(&self) -> &str {
275 &self.created_by
276 }
277
278 /// Returns `true` if offset index writing is disabled.
279 ///
280 /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
281 pub fn offset_index_disabled(&self) -> bool {
282 // If page statistics are to be collected, then do not disable the offset indexes.
283 let default_page_stats_enabled =
284 self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
285 let column_page_stats_enabled = self
286 .column_properties
287 .iter()
288 .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
289 if default_page_stats_enabled || column_page_stats_enabled {
290 return false;
291 }
292
293 self.offset_index_disabled
294 }
295
296 /// Returns `key_value_metadata` KeyValue pairs.
297 ///
298 /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
299 pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
300 self.key_value_metadata.as_ref()
301 }
302
303 /// Returns sorting columns.
304 ///
305 /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
306 pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
307 self.sorting_columns.as_ref()
308 }
309
310 /// Returns the maximum length of truncated min/max values in the column index.
311 ///
312 /// `None` if truncation is disabled, must be greater than 0 otherwise.
313 ///
314 /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
315 pub fn column_index_truncate_length(&self) -> Option<usize> {
316 self.column_index_truncate_length
317 }
318
319 /// Returns the maximum length of truncated min/max values in [`Statistics`].
320 ///
321 /// `None` if truncation is disabled, must be greater than 0 otherwise.
322 ///
323 /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
324 ///
325 /// [`Statistics`]: crate::file::statistics::Statistics
326 pub fn statistics_truncate_length(&self) -> Option<usize> {
327 self.statistics_truncate_length
328 }
329
330 /// Returns `true` if type coercion is enabled.
331 ///
332 /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
333 pub fn coerce_types(&self) -> bool {
334 self.coerce_types
335 }
336
337 /// Returns encoding for a data page, when dictionary encoding is enabled.
338 ///
339 /// This is not configurable.
340 #[inline]
341 pub fn dictionary_data_page_encoding(&self) -> Encoding {
342 // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
343 // Dictionary values are encoded using RLE_DICTIONARY encoding.
344 Encoding::RLE_DICTIONARY
345 }
346
347 /// Returns encoding for dictionary page, when dictionary encoding is enabled.
348 ///
349 /// This is not configurable.
350 #[inline]
351 pub fn dictionary_page_encoding(&self) -> Encoding {
352 // PLAIN_DICTIONARY is deprecated in writer version 1.
353 // Dictionary is encoded using plain encoding.
354 Encoding::PLAIN
355 }
356
357 /// Returns encoding for a column, if set.
358 ///
359 /// In case when dictionary is enabled, returns fallback encoding.
360 ///
361 /// If encoding is not set, then column writer will choose the best encoding
362 /// based on the column type.
363 pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
364 self.column_properties
365 .get(col)
366 .and_then(|c| c.encoding())
367 .or_else(|| self.default_column_properties.encoding())
368 }
369
370 /// Returns compression codec for a column.
371 ///
372 /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
373 pub fn compression(&self, col: &ColumnPath) -> Compression {
374 self.column_properties
375 .get(col)
376 .and_then(|c| c.compression())
377 .or_else(|| self.default_column_properties.compression())
378 .unwrap_or(DEFAULT_COMPRESSION)
379 }
380
381 /// Returns `true` if dictionary encoding is enabled for a column.
382 ///
383 /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
384 pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
385 self.column_properties
386 .get(col)
387 .and_then(|c| c.dictionary_enabled())
388 .or_else(|| self.default_column_properties.dictionary_enabled())
389 .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
390 }
391
392 /// Returns which statistics are written for a column.
393 ///
394 /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
395 pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
396 self.column_properties
397 .get(col)
398 .and_then(|c| c.statistics_enabled())
399 .or_else(|| self.default_column_properties.statistics_enabled())
400 .unwrap_or(DEFAULT_STATISTICS_ENABLED)
401 }
402
403 /// Returns `true` if [`Statistics`] are to be written to the page header for a column.
404 ///
405 /// For more details see [`WriterPropertiesBuilder::set_write_page_header_statistics`]
406 ///
407 /// [`Statistics`]: crate::file::statistics::Statistics
408 pub fn write_page_header_statistics(&self, col: &ColumnPath) -> bool {
409 self.column_properties
410 .get(col)
411 .and_then(|c| c.write_page_header_statistics())
412 .or_else(|| {
413 self.default_column_properties
414 .write_page_header_statistics()
415 })
416 .unwrap_or(DEFAULT_WRITE_PAGE_HEADER_STATISTICS)
417 }
418
419 /// Returns the [`BloomFilterProperties`] for the given column
420 ///
421 /// Returns `None` if bloom filter is disabled
422 ///
423 /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
424 pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
425 self.column_properties
426 .get(col)
427 .and_then(|c| c.bloom_filter_properties())
428 .or_else(|| self.default_column_properties.bloom_filter_properties())
429 }
430
431 /// Return file encryption properties
432 ///
433 /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
434 #[cfg(feature = "encryption")]
435 pub fn file_encryption_properties(&self) -> Option<&Arc<FileEncryptionProperties>> {
436 self.file_encryption_properties.as_ref()
437 }
438}
439
/// Builder for [`WriterProperties`] Parquet writer configuration.
///
/// The fields mirror those of [`WriterProperties`]; [`Self::build`] moves each
/// of them into the field of the same name.
///
/// See example on [`WriterProperties`]
#[derive(Debug, Clone)]
pub struct WriterPropertiesBuilder {
    /// Best effort limit on the size of a data page, in bytes.
    data_page_size_limit: usize,
    /// Best effort limit on the number of rows in a data page.
    data_page_row_count_limit: usize,
    /// Size of the batches in which column data is written.
    write_batch_size: usize,
    /// Maximum number of rows in a row group.
    max_row_group_size: usize,
    /// Where in the file Bloom filters are written.
    bloom_filter_position: BloomFilterPosition,
    /// Writer version recorded for the file.
    writer_version: WriterVersion,
    /// `created_by` string.
    created_by: String,
    /// Whether the writing of offset indexes is disabled.
    offset_index_disabled: bool,
    /// Optional key/value metadata pairs.
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings applied to every column without an override.
    default_column_properties: ColumnProperties,
    /// Per-column overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Sorting columns, if any.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Maximum length of min/max values in the column index; `None` means no limit.
    column_index_truncate_length: Option<usize>,
    /// Maximum length of min/max values in statistics; `None` means no limit.
    statistics_truncate_length: Option<usize>,
    /// Whether types are coerced to parquet native types.
    coerce_types: bool,
    /// File encryption properties, if set (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}
463
464impl Default for WriterPropertiesBuilder {
465 /// Returns default state of the builder.
466 fn default() -> Self {
467 Self {
468 data_page_size_limit: DEFAULT_PAGE_SIZE,
469 data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
470 write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
471 max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
472 bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
473 writer_version: DEFAULT_WRITER_VERSION,
474 created_by: DEFAULT_CREATED_BY.to_string(),
475 offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
476 key_value_metadata: None,
477 default_column_properties: Default::default(),
478 column_properties: HashMap::new(),
479 sorting_columns: None,
480 column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
481 statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
482 coerce_types: DEFAULT_COERCE_TYPES,
483 #[cfg(feature = "encryption")]
484 file_encryption_properties: None,
485 }
486 }
487}
488
489impl WriterPropertiesBuilder {
490 /// Finalizes the configuration and returns immutable writer properties struct.
491 pub fn build(self) -> WriterProperties {
492 WriterProperties {
493 data_page_size_limit: self.data_page_size_limit,
494 data_page_row_count_limit: self.data_page_row_count_limit,
495 write_batch_size: self.write_batch_size,
496 max_row_group_size: self.max_row_group_size,
497 bloom_filter_position: self.bloom_filter_position,
498 writer_version: self.writer_version,
499 created_by: self.created_by,
500 offset_index_disabled: self.offset_index_disabled,
501 key_value_metadata: self.key_value_metadata,
502 default_column_properties: self.default_column_properties,
503 column_properties: self.column_properties,
504 sorting_columns: self.sorting_columns,
505 column_index_truncate_length: self.column_index_truncate_length,
506 statistics_truncate_length: self.statistics_truncate_length,
507 coerce_types: self.coerce_types,
508 #[cfg(feature = "encryption")]
509 file_encryption_properties: self.file_encryption_properties,
510 }
511 }
512
513 // ----------------------------------------------------------------------
514 // Writer properties related to a file
515
516 /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
517 /// via [`DEFAULT_WRITER_VERSION`])
518 ///
519 /// This value can determine what features some readers will support.
520 ///
521 /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
522 pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
523 self.writer_version = value;
524 self
525 }
526
527 /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
528 /// via [`DEFAULT_PAGE_SIZE`]).
529 ///
530 /// The parquet writer will attempt to limit the sizes of each
531 /// `DataPage` to this many bytes. Reducing this value will result
532 /// in larger parquet files, but may improve the effectiveness of
533 /// page index based predicate pushdown during reading.
534 ///
535 /// Note: this is a best effort limit based on value of
536 /// [`set_write_batch_size`](Self::set_write_batch_size).
537 pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
538 self.data_page_size_limit = value;
539 self
540 }
541
542 /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
543 /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
544 ///
545 /// The parquet writer will attempt to limit the number of rows in
546 /// each `DataPage` to this value. Reducing this value will result
547 /// in larger parquet files, but may improve the effectiveness of
548 /// page index based predicate pushdown during reading.
549 ///
550 /// Note: this is a best effort limit based on value of
551 /// [`set_write_batch_size`](Self::set_write_batch_size).
552 pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
553 self.data_page_row_count_limit = value;
554 self
555 }
556
557 /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
558 ///
559 /// For performance reasons, data for each column is written in
560 /// batches of this size.
561 ///
562 /// Additional limits such as such as
563 /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
564 /// are checked between batches, and thus the write batch size value acts as an
565 /// upper-bound on the enforcement granularity of other limits.
566 pub fn set_write_batch_size(mut self, value: usize) -> Self {
567 self.write_batch_size = value;
568 self
569 }
570
571 /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
572 /// via [`DEFAULT_MAX_ROW_GROUP_SIZE`]).
573 ///
574 /// # Panics
575 /// If the value is set to 0.
576 pub fn set_max_row_group_size(mut self, value: usize) -> Self {
577 assert!(value > 0, "Cannot have a 0 max row group size");
578 self.max_row_group_size = value;
579 self
580 }
581
582 /// Sets where in the final file Bloom Filters are written (defaults to [`AfterRowGroup`]
583 /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
584 ///
585 /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
586 pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
587 self.bloom_filter_position = value;
588 self
589 }
590
591 /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
592 /// [`DEFAULT_CREATED_BY`]).
593 ///
594 /// This is a string that will be written into the file metadata
595 pub fn set_created_by(mut self, value: String) -> Self {
596 self.created_by = value;
597 self
598 }
599
600 /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
601 /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
602 ///
603 /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
604 ///
605 /// Note: As the offset indexes are useful for accessing data by row number,
606 /// they are always written by default, regardless of whether other statistics
607 /// are enabled. Disabling this metadata may result in a degradation in read
608 /// performance, so use this option with care.
609 ///
610 /// [`Page`]: EnabledStatistics::Page
611 pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
612 self.offset_index_disabled = value;
613 self
614 }
615
616 /// Sets "key_value_metadata" property (defaults to `None`).
617 pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
618 self.key_value_metadata = value;
619 self
620 }
621
622 /// Sets sorting order of rows in the row group if any (defaults to `None`).
623 pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
624 self.sorting_columns = value;
625 self
626 }
627
628 /// Sets the max length of min/max value fields when writing the column
629 /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
630 ///
631 /// This can be used to prevent columns with very long values (hundreds of
632 /// bytes long) from causing the parquet metadata to become huge.
633 ///
634 /// # Notes
635 ///
636 /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
637 /// set to [`EnabledStatistics::Page`].
638 ///
639 /// * If `Some`, must be greater than 0, otherwise will panic
640 /// * If `None`, there's no effective limit.
641 ///
642 /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
643 pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
644 if let Some(value) = max_length {
645 assert!(
646 value > 0,
647 "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`."
648 );
649 }
650
651 self.column_index_truncate_length = max_length;
652 self
653 }
654
655 /// Sets the max length of min/max value fields in row group and data page header
656 /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
657 ///
658 /// # Notes
659 /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
660 /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
661 /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
662 /// [`EnabledStatistics::Page`].
663 ///
664 /// * If `Some`, must be greater than 0, otherwise will panic
665 /// * If `None`, there's no effective limit.
666 ///
667 /// # See also
668 /// Truncation of Page Index statistics is controlled separately via
669 /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
670 ///
671 /// [`Statistics`]: crate::file::statistics::Statistics
672 pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
673 if let Some(value) = max_length {
674 assert!(
675 value > 0,
676 "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`."
677 );
678 }
679
680 self.statistics_truncate_length = max_length;
681 self
682 }
683
684 /// Should the writer coerce types to parquet native types (defaults to `false` via
685 /// [`DEFAULT_COERCE_TYPES`]).
686 ///
687 /// Leaving this option the default `false` will ensure the exact same data
688 /// written to parquet using this library will be read.
689 ///
690 /// Setting this option to `true` will result in parquet files that can be
691 /// read by more readers, but potentially lose information in the process.
692 ///
693 /// * Types such as [`DataType::Date64`], which have no direct corresponding
694 /// Parquet type, may be stored with lower precision.
695 ///
696 /// * The internal field names of `List` and `Map` types will be renamed if
697 /// necessary to match what is required by the newest Parquet specification.
698 ///
699 /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
700 ///
701 /// [`DataType::Date64`]: arrow_schema::DataType::Date64
702 /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
703 pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
704 self.coerce_types = coerce_types;
705 self
706 }
707
/// Sets the FileEncryptionProperties (defaults to `None`).
///
/// Only available with the `encryption` feature.
#[cfg(feature = "encryption")]
pub fn with_file_encryption_properties(
    self,
    file_encryption_properties: Arc<FileEncryptionProperties>,
) -> Self {
    Self {
        file_encryption_properties: Some(file_encryption_properties),
        ..self
    }
}
717
718 // ----------------------------------------------------------------------
719 // Setters for any column (global)
720
721 /// Sets default encoding for all columns.
722 ///
723 /// If dictionary is not enabled, this is treated as a primary encoding for all
724 /// columns. In case when dictionary is enabled for any column, this value is
725 /// considered to be a fallback encoding for that column.
726 ///
727 /// # Panics
728 ///
729 /// if dictionary encoding is specified, regardless of dictionary
730 /// encoding flag being set.
731 pub fn set_encoding(mut self, value: Encoding) -> Self {
732 self.default_column_properties.set_encoding(value);
733 self
734 }
735
736 /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
737 /// [`DEFAULT_COMPRESSION`]).
738 ///
739 /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
740 pub fn set_compression(mut self, value: Compression) -> Self {
741 self.default_column_properties.set_compression(value);
742 self
743 }
744
745 /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
746 /// via [`DEFAULT_DICTIONARY_ENABLED`]).
747 ///
748 /// Use this method to set dictionary encoding, instead of explicitly specifying
749 /// encoding in `set_encoding` method.
750 pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
751 self.default_column_properties.set_dictionary_enabled(value);
752 self
753 }
754
755 /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
756 /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
757 ///
758 /// The parquet writer will attempt to limit the size of each
759 /// `DataPage` used to store dictionaries to this many
760 /// bytes. Reducing this value will result in larger parquet
761 /// files, but may improve the effectiveness of page index based
762 /// predicate pushdown during reading.
763 ///
764 /// Note: this is a best effort limit based on value of
765 /// [`set_write_batch_size`](Self::set_write_batch_size).
766 pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
767 self.default_column_properties
768 .set_dictionary_page_size_limit(value);
769 self
770 }
771
772 /// Sets default [`EnabledStatistics`] level for all columns (defaults to [`Page`] via
773 /// [`DEFAULT_STATISTICS_ENABLED`]).
774 ///
775 /// [`Page`]: EnabledStatistics::Page
776 pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
777 self.default_column_properties.set_statistics_enabled(value);
778 self
779 }
780
781 /// enable/disable writing [`Statistics`] in the page header
782 /// (defaults to `false` via [`DEFAULT_WRITE_PAGE_HEADER_STATISTICS`]).
783 ///
784 /// Only applicable if [`Page`] level statistics are gathered.
785 ///
786 /// Setting this value to `true` can greatly increase the size of the resulting Parquet
787 /// file while yielding very little added benefit. Most modern Parquet implementations
788 /// will use the min/max values stored in the [`ParquetColumnIndex`] rather than
789 /// those in the page header.
790 ///
791 /// # Note
792 ///
793 /// Prior to version 56.0.0, the `parquet` crate always wrote these
794 /// statistics (the equivalent of setting this option to `true`). This was
795 /// changed in 56.0.0 to follow the recommendation in the Parquet
796 /// specification. See [issue #7580] for more details.
797 ///
798 /// [`Statistics`]: crate::file::statistics::Statistics
799 /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
800 /// [`Page`]: EnabledStatistics::Page
801 /// [issue #7580]: https://github.com/apache/arrow-rs/issues/7580
802 pub fn set_write_page_header_statistics(mut self, value: bool) -> Self {
803 self.default_column_properties
804 .set_write_page_header_statistics(value);
805 self
806 }
807
808 /// Sets if bloom filter should be written for all columns (defaults to `false`).
809 ///
810 /// # Notes
811 ///
812 /// * If the bloom filter is enabled previously then it is a no-op.
813 ///
814 /// * If the bloom filter is not enabled, default values for ndv and fpp
815 /// value are used used. See [`set_bloom_filter_ndv`] and
816 /// [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
817 ///
818 /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
819 /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
820 pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
821 self.default_column_properties
822 .set_bloom_filter_enabled(value);
823 self
824 }
825
826 /// Sets the default target bloom filter false positive probability (fpp)
827 /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
828 ///
829 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
830 /// been called.
831 ///
832 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
833 pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
834 self.default_column_properties.set_bloom_filter_fpp(value);
835 self
836 }
837
838 /// Sets default number of distinct values (ndv) for bloom filter for all
839 /// columns (defaults to `1_000_000` via [`DEFAULT_BLOOM_FILTER_NDV`]).
840 ///
841 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
842 /// been called.
843 ///
844 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
845 pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
846 self.default_column_properties.set_bloom_filter_ndv(value);
847 self
848 }
849
850 // ----------------------------------------------------------------------
851 // Setters for a specific column
852
853 /// Helper method to get existing or new mutable reference of column properties.
854 #[inline]
855 fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
856 self.column_properties.entry(col).or_default()
857 }
858
859 /// Sets encoding for a specific column.
860 ///
861 /// Takes precedence over [`Self::set_encoding`].
862 ///
863 /// If dictionary is not enabled, this is treated as a primary encoding for this
864 /// column. In case when dictionary is enabled for this column, either through
865 /// global defaults or explicitly, this value is considered to be a fallback
866 /// encoding for this column.
867 ///
868 /// # Panics
869 /// If user tries to set dictionary encoding here, regardless of dictionary
870 /// encoding flag being set.
871 pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
872 self.get_mut_props(col).set_encoding(value);
873 self
874 }
875
876 /// Sets compression codec for a specific column.
877 ///
878 /// Takes precedence over [`Self::set_compression`].
879 pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
880 self.get_mut_props(col).set_compression(value);
881 self
882 }
883
884 /// Sets flag to enable/disable dictionary encoding for a specific column.
885 ///
886 /// Takes precedence over [`Self::set_dictionary_enabled`].
887 pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
888 self.get_mut_props(col).set_dictionary_enabled(value);
889 self
890 }
891
892 /// Sets dictionary page size limit for a specific column.
893 ///
894 /// Takes precedence over [`Self::set_dictionary_page_size_limit`].
895 pub fn set_column_dictionary_page_size_limit(mut self, col: ColumnPath, value: usize) -> Self {
896 self.get_mut_props(col)
897 .set_dictionary_page_size_limit(value);
898 self
899 }
900
901 /// Sets [`EnabledStatistics`] level for a specific column.
902 ///
903 /// Takes precedence over [`Self::set_statistics_enabled`].
904 pub fn set_column_statistics_enabled(
905 mut self,
906 col: ColumnPath,
907 value: EnabledStatistics,
908 ) -> Self {
909 self.get_mut_props(col).set_statistics_enabled(value);
910 self
911 }
912
913 /// Sets whether to write [`Statistics`] in the page header for a specific column.
914 ///
915 /// Takes precedence over [`Self::set_write_page_header_statistics`].
916 ///
917 /// [`Statistics`]: crate::file::statistics::Statistics
918 pub fn set_column_write_page_header_statistics(mut self, col: ColumnPath, value: bool) -> Self {
919 self.get_mut_props(col)
920 .set_write_page_header_statistics(value);
921 self
922 }
923
924 /// Sets whether a bloom filter should be written for a specific column.
925 ///
926 /// Takes precedence over [`Self::set_bloom_filter_enabled`].
927 pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
928 self.get_mut_props(col).set_bloom_filter_enabled(value);
929 self
930 }
931
932 /// Sets the false positive probability for bloom filter for a specific column.
933 ///
934 /// Takes precedence over [`Self::set_bloom_filter_fpp`].
935 pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
936 self.get_mut_props(col).set_bloom_filter_fpp(value);
937 self
938 }
939
940 /// Sets the number of distinct values for bloom filter for a specific column.
941 ///
942 /// Takes precedence over [`Self::set_bloom_filter_ndv`].
943 pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
944 self.get_mut_props(col).set_bloom_filter_ndv(value);
945 self
946 }
947}
948
949impl From<WriterProperties> for WriterPropertiesBuilder {
950 fn from(props: WriterProperties) -> Self {
951 WriterPropertiesBuilder {
952 data_page_size_limit: props.data_page_size_limit,
953 data_page_row_count_limit: props.data_page_row_count_limit,
954 write_batch_size: props.write_batch_size,
955 max_row_group_size: props.max_row_group_size,
956 bloom_filter_position: props.bloom_filter_position,
957 writer_version: props.writer_version,
958 created_by: props.created_by,
959 offset_index_disabled: props.offset_index_disabled,
960 key_value_metadata: props.key_value_metadata,
961 default_column_properties: props.default_column_properties,
962 column_properties: props.column_properties,
963 sorting_columns: props.sorting_columns,
964 column_index_truncate_length: props.column_index_truncate_length,
965 statistics_truncate_length: props.statistics_truncate_length,
966 coerce_types: props.coerce_types,
967 #[cfg(feature = "encryption")]
968 file_encryption_properties: props.file_encryption_properties,
969 }
970 }
971}
972
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
///
/// A level can be parsed from a string through the `FromStr` implementation,
/// which accepts `NONE`, `CHUNK` and `PAGE` written either all-uppercase or
/// all-lowercase. `Default::default()` returns [`DEFAULT_STATISTICS_ENABLED`].
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. In addition, this will enable the writing
    /// of the column index (the offset index is always written regardless of
    /// this setting). See [`ParquetColumnIndex`] for more information.
    ///
    /// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
    Page,
}
1003
1004impl FromStr for EnabledStatistics {
1005 type Err = String;
1006
1007 fn from_str(s: &str) -> Result<Self, Self::Err> {
1008 match s {
1009 "NONE" | "none" => Ok(EnabledStatistics::None),
1010 "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk),
1011 "PAGE" | "page" => Ok(EnabledStatistics::Page),
1012 _ => Err(format!("Invalid statistics arg: {s}")),
1013 }
1014 }
1015}
1016
1017impl Default for EnabledStatistics {
1018 fn default() -> Self {
1019 DEFAULT_STATISTICS_ENABLED
1020 }
1021}
1022
/// Controls the bloom filter to be computed by the writer.
///
/// The `Default` implementation uses [`DEFAULT_BLOOM_FILTER_FPP`] and
/// [`DEFAULT_BLOOM_FILTER_NDV`].
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability. This should always be between 0 and 1 exclusive.
    /// Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space
    /// versus fpp: the smaller the fpp, the more memory and disk space is required,
    /// thus setting it to a reasonable value e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// Setting it to a very small number diminishes the value of the filter itself,
    /// as the bitset size is even larger than just storing the whole value. You are
    /// also expected to set `ndv` if it can be known in advance to greatly reduce
    /// space usage.
    pub fpp: f64,
    /// Number of distinct values. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// The type is unsigned, so the value can never be negative; presumably it
    /// should be greater than zero to be meaningful.
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Usage of bloom filter is most beneficial for columns with large cardinality,
    /// so a good heuristic is to set ndv to the number of rows. However, it can
    /// reduce disk size if you know in advance a smaller number of distinct values.
    /// For a very small ndv value it is probably not worth it to use a bloom filter
    /// anyway.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in
    /// disk or memory size.
    pub ndv: u64,
}
1050
1051impl Default for BloomFilterProperties {
1052 fn default() -> Self {
1053 BloomFilterProperties {
1054 fpp: DEFAULT_BLOOM_FILTER_FPP,
1055 ndv: DEFAULT_BLOOM_FILTER_NDV,
1056 }
1057 }
1058}
1059
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
///
/// The derived `Default` leaves every field `None`, i.e. nothing configured.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    /// Encoding recorded by `set_encoding`; never a dictionary encoding, because
    /// `set_encoding` panics on those.
    encoding: Option<Encoding>,
    /// Compression codec recorded by `set_compression`.
    codec: Option<Compression>,
    /// Dictionary page size limit recorded by `set_dictionary_page_size_limit`.
    dictionary_page_size_limit: Option<usize>,
    /// Whether dictionary encoding was explicitly enabled (`Some(true)`) or
    /// disabled (`Some(false)`).
    dictionary_enabled: Option<bool>,
    /// Statistics level recorded by `set_statistics_enabled`.
    statistics_enabled: Option<EnabledStatistics>,
    /// Whether statistics should be written in the page header, as recorded by
    /// `set_write_page_header_statistics`.
    write_page_header_statistics: Option<bool>,
    /// Bloom filter related properties; `None` means the bloom filter is not
    /// enabled here.
    bloom_filter_properties: Option<BloomFilterProperties>,
}
1075
1076impl ColumnProperties {
1077 /// Sets encoding for this column.
1078 ///
1079 /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1080 /// In case when dictionary is enabled for a column, this value is considered to
1081 /// be a fallback encoding.
1082 ///
1083 /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1084 /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1085 /// for a column.
1086 fn set_encoding(&mut self, value: Encoding) {
1087 if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1088 panic!("Dictionary encoding can not be used as fallback encoding");
1089 }
1090 self.encoding = Some(value);
1091 }
1092
1093 /// Sets compression codec for this column.
1094 fn set_compression(&mut self, value: Compression) {
1095 self.codec = Some(value);
1096 }
1097
1098 /// Sets whether dictionary encoding is enabled for this column.
1099 fn set_dictionary_enabled(&mut self, enabled: bool) {
1100 self.dictionary_enabled = Some(enabled);
1101 }
1102
1103 /// Sets dictionary page size limit for this column.
1104 fn set_dictionary_page_size_limit(&mut self, value: usize) {
1105 self.dictionary_page_size_limit = Some(value);
1106 }
1107
1108 /// Sets the statistics level for this column.
1109 fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1110 self.statistics_enabled = Some(enabled);
1111 }
1112
1113 /// Sets whether to write statistics in the page header for this column.
1114 fn set_write_page_header_statistics(&mut self, enabled: bool) {
1115 self.write_page_header_statistics = Some(enabled);
1116 }
1117
1118 /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1119 /// otherwise it is a no-op.
1120 /// If `value` is `false`, resets bloom filter properties to `None`.
1121 fn set_bloom_filter_enabled(&mut self, value: bool) {
1122 if value && self.bloom_filter_properties.is_none() {
1123 self.bloom_filter_properties = Some(Default::default())
1124 } else if !value {
1125 self.bloom_filter_properties = None
1126 }
1127 }
1128
1129 /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1130 /// bloom filter if not previously enabled.
1131 ///
1132 /// # Panics
1133 ///
1134 /// Panics if the `value` is not between 0 and 1 exclusive
1135 fn set_bloom_filter_fpp(&mut self, value: f64) {
1136 assert!(
1137 value > 0. && value < 1.0,
1138 "fpp must be between 0 and 1 exclusive, got {value}"
1139 );
1140
1141 self.bloom_filter_properties
1142 .get_or_insert_with(Default::default)
1143 .fpp = value;
1144 }
1145
1146 /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
1147 /// enables bloom filter if not previously enabled.
1148 fn set_bloom_filter_ndv(&mut self, value: u64) {
1149 self.bloom_filter_properties
1150 .get_or_insert_with(Default::default)
1151 .ndv = value;
1152 }
1153
1154 /// Returns optional encoding for this column.
1155 fn encoding(&self) -> Option<Encoding> {
1156 self.encoding
1157 }
1158
1159 /// Returns optional compression codec for this column.
1160 fn compression(&self) -> Option<Compression> {
1161 self.codec
1162 }
1163
1164 /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1165 /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1166 /// been provided.
1167 fn dictionary_enabled(&self) -> Option<bool> {
1168 self.dictionary_enabled
1169 }
1170
1171 /// Returns optional dictionary page size limit for this column.
1172 fn dictionary_page_size_limit(&self) -> Option<usize> {
1173 self.dictionary_page_size_limit
1174 }
1175
1176 /// Returns optional statistics level requested for this column. If result is `None`,
1177 /// then no setting has been provided.
1178 fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1179 self.statistics_enabled
1180 }
1181
1182 /// Returns `Some(true)` if [`Statistics`] are to be written to the page header for this
1183 /// column.
1184 ///
1185 /// [`Statistics`]: crate::file::statistics::Statistics
1186 fn write_page_header_statistics(&self) -> Option<bool> {
1187 self.write_page_header_statistics
1188 }
1189
1190 /// Returns the bloom filter properties, or `None` if not enabled
1191 fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1192 self.bloom_filter_properties.as_ref()
1193 }
1194}
1195
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

/// Value used for `read_bloom_filter` by [`ReaderPropertiesBuilder::build`] when
/// [`ReaderPropertiesBuilder::set_read_bloom_filter`] was never called.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
/// Value used for `read_page_stats` by [`ReaderPropertiesBuilder::build`] when
/// [`ReaderPropertiesBuilder::set_read_page_statistics`] was never called.
const DEFAULT_READ_PAGE_STATS: bool = false;
1201
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    /// Codec options produced by the builder's `CodecOptionsBuilder`.
    codec_options: CodecOptions,
    /// Whether to read the bloom filter; see
    /// [`ReaderPropertiesBuilder::set_read_bloom_filter`].
    read_bloom_filter: bool,
    /// Whether to read page level statistics; see
    /// [`ReaderPropertiesBuilder::set_read_page_statistics`].
    read_page_stats: bool,
}
1225
1226impl ReaderProperties {
1227 /// Returns builder for reader properties with default values.
1228 pub fn builder() -> ReaderPropertiesBuilder {
1229 ReaderPropertiesBuilder::with_defaults()
1230 }
1231
1232 /// Returns codec options.
1233 pub(crate) fn codec_options(&self) -> &CodecOptions {
1234 &self.codec_options
1235 }
1236
1237 /// Returns whether to read bloom filter
1238 pub(crate) fn read_bloom_filter(&self) -> bool {
1239 self.read_bloom_filter
1240 }
1241
1242 /// Returns whether to read page level statistics
1243 pub(crate) fn read_page_stats(&self) -> bool {
1244 self.read_page_stats
1245 }
1246}
1247
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    /// Builder for the codec options handed to the resulting [`ReaderProperties`].
    codec_options_builder: CodecOptionsBuilder,
    /// Explicit bloom filter reading choice; `None` means `build` falls back to
    /// `DEFAULT_READ_BLOOM_FILTER`.
    read_bloom_filter: Option<bool>,
    /// Explicit page statistics reading choice; `None` means `build` falls back to
    /// `DEFAULT_READ_PAGE_STATS`.
    read_page_stats: Option<bool>,
}
1255
1256/// Reader properties builder.
1257impl ReaderPropertiesBuilder {
1258 /// Returns default state of the builder.
1259 fn with_defaults() -> Self {
1260 Self {
1261 codec_options_builder: CodecOptionsBuilder::default(),
1262 read_bloom_filter: None,
1263 read_page_stats: None,
1264 }
1265 }
1266
1267 /// Finalizes the configuration and returns immutable reader properties struct.
1268 pub fn build(self) -> ReaderProperties {
1269 ReaderProperties {
1270 codec_options: self.codec_options_builder.build(),
1271 read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1272 read_page_stats: self.read_page_stats.unwrap_or(DEFAULT_READ_PAGE_STATS),
1273 }
1274 }
1275
1276 /// Enable/disable backward compatible LZ4.
1277 ///
1278 /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1279 /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1280 /// with files generated by older versions of this library, and LZ4_RAW, for backward
1281 /// compatibility with files generated by older versions of parquet-cpp.
1282 ///
1283 /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1284 pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1285 self.codec_options_builder = self
1286 .codec_options_builder
1287 .set_backward_compatible_lz4(value);
1288 self
1289 }
1290
1291 /// Enable/disable reading bloom filter
1292 ///
1293 /// If reading bloom filter is enabled, bloom filter will be read from the file.
1294 /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1295 ///
1296 /// By default bloom filter is set to be read.
1297 pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1298 self.read_bloom_filter = Some(value);
1299 self
1300 }
1301
1302 /// Enable/disable reading page-level statistics
1303 ///
1304 /// If set to `true`, then the reader will decode and populate the [`Statistics`] for
1305 /// each page, if present.
1306 /// If set to `false`, then the reader will skip decoding the statistics.
1307 ///
1308 /// By default statistics will not be decoded.
1309 ///
1310 /// [`Statistics`]: crate::file::statistics::Statistics
1311 pub fn set_read_page_statistics(mut self, value: bool) -> Self {
1312 self.read_page_stats = Some(value);
1313 self
1314 }
1315}
1316
#[cfg(test)]
mod tests {
    //! Unit tests for writer and reader properties: defaults, builder round trips,
    //! panics on invalid input, and string parsing.
    use super::*;

    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        assert!(
            props
                .bloom_filter_properties(&ColumnPath::from("col"))
                .is_none()
        );
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "fpp must be between 0 and 1 exclusive")]
    fn test_writer_properties_panic_when_bloom_filter_fpp_is_out_of_range() {
        // The valid range is exclusive, so 1.0 itself must be rejected.
        WriterProperties::builder().set_bloom_filter_fpp(1.0).build();
    }

    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_size(40)
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        fn test_props(props: &WriterProperties) {
            assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
            assert_eq!(props.data_page_size_limit(), 10);
            assert_eq!(props.dictionary_page_size_limit(), 20);
            assert_eq!(props.write_batch_size(), 30);
            assert_eq!(props.max_row_group_size(), 40);
            assert_eq!(props.created_by(), "default");
            assert_eq!(
                props.key_value_metadata(),
                Some(&vec![
                    KeyValue::new("key".to_string(), "value".to_string(),)
                ])
            );

            assert_eq!(
                props.encoding(&ColumnPath::from("a")),
                Some(Encoding::DELTA_BINARY_PACKED)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("a")),
                Compression::GZIP(Default::default())
            );
            assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("a")),
                EnabledStatistics::None
            );

            assert_eq!(
                props.encoding(&ColumnPath::from("col")),
                Some(Encoding::RLE)
            );
            assert_eq!(
                props.compression(&ColumnPath::from("col")),
                Compression::SNAPPY
            );
            assert!(props.dictionary_enabled(&ColumnPath::from("col")));
            assert_eq!(
                props.statistics_enabled(&ColumnPath::from("col")),
                EnabledStatistics::Chunk
            );
            assert_eq!(
                props.bloom_filter_properties(&ColumnPath::from("col")),
                Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
            );
        }

        // Test direct build of properties
        test_props(&props);

        // Test that into_builder() gives the same result
        let props_into_builder_and_back = props.into_builder().build();
        test_props(&props_into_builder_and_back);
    }

    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 100
            })
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_writer_properties_bloom_filter_disabled_resets_properties() {
        // Disabling the bloom filter discards an ndv that implicitly enabled it.
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .set_bloom_filter_enabled(false)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
    }

    #[test]
    fn test_writer_properties_column_dictionary_page_size_limit() {
        let props = WriterProperties::builder()
            .set_dictionary_page_size_limit(100)
            .set_column_dictionary_page_size_limit(ColumnPath::from("col"), 10)
            .build();

        assert_eq!(props.dictionary_page_size_limit(), 100);
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("col")),
            10
        );
        assert_eq!(
            props.column_dictionary_page_size_limit(&ColumnPath::from("other")),
            100
        );
    }

    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(!props.read_bloom_filter());
        assert!(!props.read_page_stats());
    }

    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .set_read_bloom_filter(true)
            .set_read_page_statistics(true)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(props.read_bloom_filter());
        assert!(props.read_page_stats());
    }

    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);

        // test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }
}