parquet/file/properties.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::KeyValue;
24use crate::format::SortingColumn;
25use crate::schema::types::ColumnPath;
26use std::str::FromStr;
27use std::{collections::HashMap, sync::Arc};
28
/// Default value for [`WriterProperties::data_page_size_limit`], in bytes (1 MiB)
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
/// (identical to [`DEFAULT_PAGE_SIZE`])
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`], in rows
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::max_statistics_size`]
#[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
/// Default value for [`WriterProperties::max_row_group_size`], in rows
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
///
/// The version component is taken from `CARGO_PKG_VERSION` at compile time.
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
/// (`None`: statistics are not truncated)
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
66
/// Version of the Parquet format targeted by the writer.
///
/// This is a library-level constant; it is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns the numeric form of the version: `1` for
    /// [`PARQUET_1_0`](Self::PARQUET_1_0), `2` for [`PARQUET_2_0`](Self::PARQUET_2_0).
    pub fn as_num(&self) -> i32 {
        match *self {
            Self::PARQUET_1_0 => 1,
            Self::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a writer version from its variant name.
    ///
    /// Only the all-uppercase and all-lowercase spellings are accepted
    /// (e.g. `"PARQUET_1_0"` or `"parquet_1_0"`); any other input produces an
    /// error message that echoes the rejected string.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let version = match s {
            "PARQUET_1_0" | "parquet_1_0" => Self::PARQUET_1_0,
            "PARQUET_2_0" | "parquet_2_0" => Self::PARQUET_2_0,
            unknown => return Err(format!("Invalid writer version: {unknown}")),
        };
        Ok(version)
    }
}
100
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// The choice is a trade-off between writer memory usage and data locality
/// for readers; see the individual variants.
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
118
/// Reference counted writer properties: an [`Arc`] around [`WriterProperties`].
pub type WriterPropertiesPtr = Arc<WriterProperties>;
121
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// Per-column settings are looked up in `column_properties` first and fall
/// back to `default_column_properties` when the column has no override.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    /// Best-effort data page size limit; see [`Self::data_page_size_limit`].
    data_page_size_limit: usize,
    /// Best-effort dictionary page size limit; see [`Self::dictionary_page_size_limit`].
    dictionary_page_size_limit: usize,
    /// Best-effort maximum rows per data page; see [`Self::data_page_row_count_limit`].
    data_page_row_count_limit: usize,
    /// Batch size used when writing; see [`Self::write_batch_size`].
    write_batch_size: usize,
    /// Maximum number of rows in a row group; see [`Self::max_row_group_size`].
    max_row_group_size: usize,
    /// Where Bloom filters are written; see [`Self::bloom_filter_position`].
    bloom_filter_position: BloomFilterPosition,
    /// Writer version; see [`Self::writer_version`].
    writer_version: WriterVersion,
    /// The `created_by` string; see [`Self::created_by`].
    created_by: String,
    /// Raw flag as configured. [`Self::offset_index_disabled`] may still report
    /// `false` when page-level statistics are explicitly requested.
    offset_index_disabled: bool,
    /// Optional key/value pairs; see [`Self::key_value_metadata`].
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    /// Settings used for columns that have no override in `column_properties`.
    default_column_properties: ColumnProperties,
    /// Per-column overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Optional sorting columns; see [`Self::sorting_columns`].
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Truncation length for column index min/max values (`None` disables truncation).
    column_index_truncate_length: Option<usize>,
    /// Truncation length for statistics min/max values (`None` disables truncation).
    statistics_truncate_length: Option<usize>,
    /// Whether type coercion is enabled; see [`Self::coerce_types`].
    coerce_types: bool,
    /// Optional file encryption settings (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<FileEncryptionProperties>,
}
177
178impl Default for WriterProperties {
179 fn default() -> Self {
180 Self::builder().build()
181 }
182}
183
184impl WriterProperties {
185 /// Create a new [`WriterProperties`] with the default settings
186 ///
187 /// See [`WriterProperties::builder`] for customising settings
188 pub fn new() -> Self {
189 Self::default()
190 }
191
192 /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
193 /// properties.
194 pub fn builder() -> WriterPropertiesBuilder {
195 WriterPropertiesBuilder::with_defaults()
196 }
197
198 /// Returns data page size limit.
199 ///
200 /// Note: this is a best effort limit based on the write batch size
201 ///
202 /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
203 pub fn data_page_size_limit(&self) -> usize {
204 self.data_page_size_limit
205 }
206
207 /// Returns dictionary page size limit.
208 ///
209 /// Note: this is a best effort limit based on the write batch size
210 ///
211 /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
212 pub fn dictionary_page_size_limit(&self) -> usize {
213 self.dictionary_page_size_limit
214 }
215
216 /// Returns the maximum page row count
217 ///
218 /// Note: this is a best effort limit based on the write batch size
219 ///
220 /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
221 pub fn data_page_row_count_limit(&self) -> usize {
222 self.data_page_row_count_limit
223 }
224
225 /// Returns configured batch size for writes.
226 ///
227 /// When writing a batch of data, this setting allows to split it internally into
228 /// smaller batches so we can better estimate the size of a page currently being
229 /// written.
230 ///
231 /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
232 pub fn write_batch_size(&self) -> usize {
233 self.write_batch_size
234 }
235
236 /// Returns maximum number of rows in a row group.
237 ///
238 /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
239 pub fn max_row_group_size(&self) -> usize {
240 self.max_row_group_size
241 }
242
243 /// Returns bloom filter position.
244 ///
245 /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
246 pub fn bloom_filter_position(&self) -> BloomFilterPosition {
247 self.bloom_filter_position
248 }
249
250 /// Returns configured writer version.
251 ///
252 /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
253 pub fn writer_version(&self) -> WriterVersion {
254 self.writer_version
255 }
256
257 /// Returns `created_by` string.
258 ///
259 /// For more details see [`WriterPropertiesBuilder::set_created_by`]
260 pub fn created_by(&self) -> &str {
261 &self.created_by
262 }
263
264 /// Returns `true` if offset index writing is disabled.
265 ///
266 /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
267 pub fn offset_index_disabled(&self) -> bool {
268 // If page statistics are to be collected, then do not disable the offset indexes.
269 let default_page_stats_enabled =
270 self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
271 let column_page_stats_enabled = self
272 .column_properties
273 .iter()
274 .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
275 if default_page_stats_enabled || column_page_stats_enabled {
276 return false;
277 }
278
279 self.offset_index_disabled
280 }
281
282 /// Returns `key_value_metadata` KeyValue pairs.
283 ///
284 /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
285 pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
286 self.key_value_metadata.as_ref()
287 }
288
289 /// Returns sorting columns.
290 ///
291 /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
292 pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
293 self.sorting_columns.as_ref()
294 }
295
296 /// Returns the maximum length of truncated min/max values in the column index.
297 ///
298 /// `None` if truncation is disabled, must be greater than 0 otherwise.
299 ///
300 /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
301 pub fn column_index_truncate_length(&self) -> Option<usize> {
302 self.column_index_truncate_length
303 }
304
305 /// Returns the maximum length of truncated min/max values in statistics.
306 ///
307 /// `None` if truncation is disabled, must be greater than 0 otherwise.
308 ///
309 /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
310 pub fn statistics_truncate_length(&self) -> Option<usize> {
311 self.statistics_truncate_length
312 }
313
314 /// Returns `true` if type coercion is enabled.
315 ///
316 /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
317 pub fn coerce_types(&self) -> bool {
318 self.coerce_types
319 }
320
321 /// Returns encoding for a data page, when dictionary encoding is enabled.
322 ///
323 /// This is not configurable.
324 #[inline]
325 pub fn dictionary_data_page_encoding(&self) -> Encoding {
326 // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
327 // Dictionary values are encoded using RLE_DICTIONARY encoding.
328 Encoding::RLE_DICTIONARY
329 }
330
331 /// Returns encoding for dictionary page, when dictionary encoding is enabled.
332 ///
333 /// This is not configurable.
334 #[inline]
335 pub fn dictionary_page_encoding(&self) -> Encoding {
336 // PLAIN_DICTIONARY is deprecated in writer version 1.
337 // Dictionary is encoded using plain encoding.
338 Encoding::PLAIN
339 }
340
341 /// Returns encoding for a column, if set.
342 ///
343 /// In case when dictionary is enabled, returns fallback encoding.
344 ///
345 /// If encoding is not set, then column writer will choose the best encoding
346 /// based on the column type.
347 pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
348 self.column_properties
349 .get(col)
350 .and_then(|c| c.encoding())
351 .or_else(|| self.default_column_properties.encoding())
352 }
353
354 /// Returns compression codec for a column.
355 ///
356 /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
357 pub fn compression(&self, col: &ColumnPath) -> Compression {
358 self.column_properties
359 .get(col)
360 .and_then(|c| c.compression())
361 .or_else(|| self.default_column_properties.compression())
362 .unwrap_or(DEFAULT_COMPRESSION)
363 }
364
365 /// Returns `true` if dictionary encoding is enabled for a column.
366 ///
367 /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
368 pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
369 self.column_properties
370 .get(col)
371 .and_then(|c| c.dictionary_enabled())
372 .or_else(|| self.default_column_properties.dictionary_enabled())
373 .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
374 }
375
376 /// Returns which statistics are written for a column.
377 ///
378 /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
379 pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
380 self.column_properties
381 .get(col)
382 .and_then(|c| c.statistics_enabled())
383 .or_else(|| self.default_column_properties.statistics_enabled())
384 .unwrap_or(DEFAULT_STATISTICS_ENABLED)
385 }
386
387 /// Returns max size for statistics.
388 ///
389 /// UNUSED
390 #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
391 pub fn max_statistics_size(&self, col: &ColumnPath) -> usize {
392 #[allow(deprecated)]
393 self.column_properties
394 .get(col)
395 .and_then(|c| c.max_statistics_size())
396 .or_else(|| self.default_column_properties.max_statistics_size())
397 .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
398 }
399
400 /// Returns the [`BloomFilterProperties`] for the given column
401 ///
402 /// Returns `None` if bloom filter is disabled
403 ///
404 /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
405 pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
406 self.column_properties
407 .get(col)
408 .and_then(|c| c.bloom_filter_properties())
409 .or_else(|| self.default_column_properties.bloom_filter_properties())
410 }
411
412 /// Return file encryption properties
413 ///
414 /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
415 #[cfg(feature = "encryption")]
416 pub fn file_encryption_properties(&self) -> Option<&FileEncryptionProperties> {
417 self.file_encryption_properties.as_ref()
418 }
419}
420
421/// Builder for [`WriterProperties`] Parquet writer configuration.
422///
423/// See example on [`WriterProperties`]
424pub struct WriterPropertiesBuilder {
425 data_page_size_limit: usize,
426 dictionary_page_size_limit: usize,
427 data_page_row_count_limit: usize,
428 write_batch_size: usize,
429 max_row_group_size: usize,
430 bloom_filter_position: BloomFilterPosition,
431 writer_version: WriterVersion,
432 created_by: String,
433 offset_index_disabled: bool,
434 key_value_metadata: Option<Vec<KeyValue>>,
435 default_column_properties: ColumnProperties,
436 column_properties: HashMap<ColumnPath, ColumnProperties>,
437 sorting_columns: Option<Vec<SortingColumn>>,
438 column_index_truncate_length: Option<usize>,
439 statistics_truncate_length: Option<usize>,
440 coerce_types: bool,
441 #[cfg(feature = "encryption")]
442 file_encryption_properties: Option<FileEncryptionProperties>,
443}
444
445impl WriterPropertiesBuilder {
446 /// Returns default state of the builder.
447 fn with_defaults() -> Self {
448 Self {
449 data_page_size_limit: DEFAULT_PAGE_SIZE,
450 dictionary_page_size_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT,
451 data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
452 write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
453 max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
454 bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
455 writer_version: DEFAULT_WRITER_VERSION,
456 created_by: DEFAULT_CREATED_BY.to_string(),
457 offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
458 key_value_metadata: None,
459 default_column_properties: Default::default(),
460 column_properties: HashMap::new(),
461 sorting_columns: None,
462 column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
463 statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
464 coerce_types: DEFAULT_COERCE_TYPES,
465 #[cfg(feature = "encryption")]
466 file_encryption_properties: None,
467 }
468 }
469
470 /// Finalizes the configuration and returns immutable writer properties struct.
471 pub fn build(self) -> WriterProperties {
472 WriterProperties {
473 data_page_size_limit: self.data_page_size_limit,
474 dictionary_page_size_limit: self.dictionary_page_size_limit,
475 data_page_row_count_limit: self.data_page_row_count_limit,
476 write_batch_size: self.write_batch_size,
477 max_row_group_size: self.max_row_group_size,
478 bloom_filter_position: self.bloom_filter_position,
479 writer_version: self.writer_version,
480 created_by: self.created_by,
481 offset_index_disabled: self.offset_index_disabled,
482 key_value_metadata: self.key_value_metadata,
483 default_column_properties: self.default_column_properties,
484 column_properties: self.column_properties,
485 sorting_columns: self.sorting_columns,
486 column_index_truncate_length: self.column_index_truncate_length,
487 statistics_truncate_length: self.statistics_truncate_length,
488 coerce_types: self.coerce_types,
489 #[cfg(feature = "encryption")]
490 file_encryption_properties: self.file_encryption_properties,
491 }
492 }
493
494 // ----------------------------------------------------------------------
495 // Writer properties related to a file
496
497 /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`])
498 ///
499 /// This value can determine what features some readers will support.
500 ///
501 /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
502 pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
503 self.writer_version = value;
504 self
505 }
506
507 /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`).
508 ///
509 /// The parquet writer will attempt to limit the sizes of each
510 /// `DataPage` to this many bytes. Reducing this value will result
511 /// in larger parquet files, but may improve the effectiveness of
512 /// page index based predicate pushdown during reading.
513 ///
514 /// Note: this is a best effort limit based on value of
515 /// [`set_write_batch_size`](Self::set_write_batch_size).
516 pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
517 self.data_page_size_limit = value;
518 self
519 }
520
521 /// Sets best effort maximum number of rows in a data page (defaults to `20_000`).
522 ///
523 /// The parquet writer will attempt to limit the number of rows in
524 /// each `DataPage` to this value. Reducing this value will result
525 /// in larger parquet files, but may improve the effectiveness of
526 /// page index based predicate pushdown during reading.
527 ///
528 /// Note: this is a best effort limit based on value of
529 /// [`set_write_batch_size`](Self::set_write_batch_size).
530 pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
531 self.data_page_row_count_limit = value;
532 self
533 }
534
535 /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`).
536 ///
537 /// The parquet writer will attempt to limit the size of each
538 /// `DataPage` used to store dictionaries to this many
539 /// bytes. Reducing this value will result in larger parquet
540 /// files, but may improve the effectiveness of page index based
541 /// predicate pushdown during reading.
542 ///
543 /// Note: this is a best effort limit based on value of
544 /// [`set_write_batch_size`](Self::set_write_batch_size).
545 pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
546 self.dictionary_page_size_limit = value;
547 self
548 }
549
550 /// Sets write batch size (defaults to 1024).
551 ///
552 /// For performance reasons, data for each column is written in
553 /// batches of this size.
554 ///
555 /// Additional limits such as such as
556 /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
557 /// are checked between batches, and thus the write batch size value acts as an
558 /// upper-bound on the enforcement granularity of other limits.
559 pub fn set_write_batch_size(mut self, value: usize) -> Self {
560 self.write_batch_size = value;
561 self
562 }
563
564 /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`).
565 ///
566 /// # Panics
567 /// If the value is set to 0.
568 pub fn set_max_row_group_size(mut self, value: usize) -> Self {
569 assert!(value > 0, "Cannot have a 0 max row group size");
570 self.max_row_group_size = value;
571 self
572 }
573
574 /// Sets where in the final file Bloom Filters are written (defaults to [`AfterRowGroup`])
575 ///
576 /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
577 pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
578 self.bloom_filter_position = value;
579 self
580 }
581
582 /// Sets "created by" property (defaults to `parquet-rs version <VERSION>`).
583 ///
584 /// This is a string that will be written into the file metadata
585 pub fn set_created_by(mut self, value: String) -> Self {
586 self.created_by = value;
587 self
588 }
589
590 /// Sets whether the writing of offset indexes is disabled (defaults to `false`).
591 ///
592 /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
593 ///
594 /// Note: As the offset indexes are useful for accessing data by row number,
595 /// they are always written by default, regardless of whether other statistics
596 /// are enabled. Disabling this metadata may result in a degradation in read
597 /// performance, so use this option with care.
598 ///
599 /// [`Page`]: EnabledStatistics::Page
600 pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
601 self.offset_index_disabled = value;
602 self
603 }
604
605 /// Sets "key_value_metadata" property (defaults to `None`).
606 pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
607 self.key_value_metadata = value;
608 self
609 }
610
611 /// Sets sorting order of rows in the row group if any (defaults to `None`).
612 pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
613 self.sorting_columns = value;
614 self
615 }
616
617 // ----------------------------------------------------------------------
618 // Setters for any column (global)
619
620 /// Sets default encoding for all columns.
621 ///
622 /// If dictionary is not enabled, this is treated as a primary encoding for all
623 /// columns. In case when dictionary is enabled for any column, this value is
624 /// considered to be a fallback encoding for that column.
625 ///
626 /// # Panics
627 ///
628 /// if dictionary encoding is specified, regardless of dictionary
629 /// encoding flag being set.
630 pub fn set_encoding(mut self, value: Encoding) -> Self {
631 self.default_column_properties.set_encoding(value);
632 self
633 }
634
635 /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`]).
636 ///
637 /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
638 pub fn set_compression(mut self, value: Compression) -> Self {
639 self.default_column_properties.set_compression(value);
640 self
641 }
642
643 /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`).
644 ///
645 /// Use this method to set dictionary encoding, instead of explicitly specifying
646 /// encoding in `set_encoding` method.
647 pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
648 self.default_column_properties.set_dictionary_enabled(value);
649 self
650 }
651
652 /// Sets default statistics level for all columns (defaults to [`Page`]).
653 ///
654 /// [`Page`]: EnabledStatistics::Page
655 pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
656 self.default_column_properties.set_statistics_enabled(value);
657 self
658 }
659
660 /// Sets default max statistics size for all columns (defaults to `4096`).
661 ///
662 /// Applicable only if statistics are enabled.
663 #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
664 pub fn set_max_statistics_size(mut self, value: usize) -> Self {
665 #[allow(deprecated)]
666 self.default_column_properties
667 .set_max_statistics_size(value);
668 self
669 }
670
671 /// Sets if bloom filter should be written for all columns (defaults to `false`).
672 ///
673 /// # Notes
674 ///
675 /// * If the bloom filter is enabled previously then it is a no-op.
676 ///
677 /// * If the bloom filter is not enabled, default values for ndv and fpp
678 /// value are used used. See [`set_bloom_filter_ndv`] and
679 /// [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
680 ///
681 /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
682 /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
683 pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
684 self.default_column_properties
685 .set_bloom_filter_enabled(value);
686 self
687 }
688
689 /// Sets the default target bloom filter false positive probability (fpp)
690 /// for all columns (defaults to `0.05`).
691 ///
692 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
693 /// been called.
694 ///
695 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
696 pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
697 self.default_column_properties.set_bloom_filter_fpp(value);
698 self
699 }
700
701 /// Sets default number of distinct values (ndv) for bloom filter for all
702 /// columns (defaults to `1_000_000`).
703 ///
704 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
705 /// been called.
706 ///
707 /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
708 pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
709 self.default_column_properties.set_bloom_filter_ndv(value);
710 self
711 }
712
713 // ----------------------------------------------------------------------
714 // Setters for a specific column
715
716 /// Helper method to get existing or new mutable reference of column properties.
717 #[inline]
718 fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
719 self.column_properties.entry(col).or_default()
720 }
721
722 /// Sets encoding for a specific column.
723 ///
724 /// Takes precedence over [`Self::set_encoding`].
725 ///
726 /// If dictionary is not enabled, this is treated as a primary encoding for this
727 /// column. In case when dictionary is enabled for this column, either through
728 /// global defaults or explicitly, this value is considered to be a fallback
729 /// encoding for this column.
730 ///
731 /// # Panics
732 /// If user tries to set dictionary encoding here, regardless of dictionary
733 /// encoding flag being set.
734 pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
735 self.get_mut_props(col).set_encoding(value);
736 self
737 }
738
739 /// Sets compression codec for a specific column.
740 ///
741 /// Takes precedence over [`Self::set_compression`].
742 pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
743 self.get_mut_props(col).set_compression(value);
744 self
745 }
746
747 /// Sets flag to enable/disable dictionary encoding for a specific column.
748 ///
749 /// Takes precedence over [`Self::set_dictionary_enabled`].
750 pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
751 self.get_mut_props(col).set_dictionary_enabled(value);
752 self
753 }
754
755 /// Sets statistics level for a specific column
756 ///
757 /// Takes precedence over [`Self::set_statistics_enabled`].
758 pub fn set_column_statistics_enabled(
759 mut self,
760 col: ColumnPath,
761 value: EnabledStatistics,
762 ) -> Self {
763 self.get_mut_props(col).set_statistics_enabled(value);
764 self
765 }
766
767 /// Sets max size for statistics for a specific column.
768 ///
769 /// Takes precedence over [`Self::set_max_statistics_size`].
770 #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
771 pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self {
772 #[allow(deprecated)]
773 self.get_mut_props(col).set_max_statistics_size(value);
774 self
775 }
776
777 /// Sets whether a bloom filter should be written for a specific column.
778 ///
779 /// Takes precedence over [`Self::set_bloom_filter_enabled`].
780 pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
781 self.get_mut_props(col).set_bloom_filter_enabled(value);
782 self
783 }
784
785 /// Sets the false positive probability for bloom filter for a specific column.
786 ///
787 /// Takes precedence over [`Self::set_bloom_filter_fpp`].
788 pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
789 self.get_mut_props(col).set_bloom_filter_fpp(value);
790 self
791 }
792
793 /// Sets the number of distinct values for bloom filter for a specific column.
794 ///
795 /// Takes precedence over [`Self::set_bloom_filter_ndv`].
796 pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
797 self.get_mut_props(col).set_bloom_filter_ndv(value);
798 self
799 }
800
801 /// Sets the max length of min/max value fields when writing the column
802 /// [`Index`] (defaults to `None` (no limit)).
803 ///
804 /// This can be used to prevent columns with very long values (hundreds of
805 /// bytes long) from causing the parquet metadata to become huge.
806 ///
807 /// # Notes
808 ///
809 /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
810 /// set to [`EnabledStatistics::Page`].
811 ///
812 /// * If `Some`, must be greater than 0, otherwise will panic
813 /// * If `None`, there's no effective limit.
814 ///
815 /// [`Index`]: crate::file::page_index::index::Index
816 pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
817 if let Some(value) = max_length {
818 assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`.");
819 }
820
821 self.column_index_truncate_length = max_length;
822 self
823 }
824
825 /// Sets the max length of min/max value fields in row group level
826 /// [`Statistics`] (defaults to `None` (no limit)).
827 ///
828 /// # Notes
829 /// Row group level [`Statistics`] are written when [`Self::set_statistics_enabled`] is
830 /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`].
831 ///
832 /// * If `Some`, must be greater than 0, otherwise will panic
833 /// * If `None`, there's no effective limit.
834 ///
835 /// [`Statistics`]: crate::file::statistics::Statistics
836 pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
837 if let Some(value) = max_length {
838 assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
839 }
840
841 self.statistics_truncate_length = max_length;
842 self
843 }
844
845 /// Should the writer coerce types to parquet native types (defaults to `false`).
846 ///
847 /// Leaving this option the default `false` will ensure the exact same data
848 /// written to parquet using this library will be read.
849 ///
850 /// Setting this option to `true` will result in parquet files that can be
851 /// read by more readers, but potentially lose information in the process.
852 ///
853 /// * Types such as [`DataType::Date64`], which have no direct corresponding
854 /// Parquet type, may be stored with lower precision.
855 ///
856 /// * The internal field names of `List` and `Map` types will be renamed if
857 /// necessary to match what is required by the newest Parquet specification.
858 ///
859 /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
860 ///
861 /// [`DataType::Date64`]: arrow_schema::DataType::Date64
862 /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
863 pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
864 self.coerce_types = coerce_types;
865 self
866 }
867
/// Sets the [`FileEncryptionProperties`] used by the writer (defaults to `None`).
///
/// Only available when the `encryption` feature is enabled.
#[cfg(feature = "encryption")]
pub fn with_file_encryption_properties(
    self,
    file_encryption_properties: FileEncryptionProperties,
) -> Self {
    let mut builder = self;
    builder.file_encryption_properties = Some(file_encryption_properties);
    builder
}
877}
878
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
///
/// The default level is [`DEFAULT_STATISTICS_ENABLED`]. A level can also be
/// parsed from a string: `NONE`, `CHUNK` and `PAGE` are accepted, either
/// fully upper-case or fully lower-case.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each page and row group. The more row groups and the more
    /// pages written, the more statistics will be stored.
    Page,
}
905
906impl FromStr for EnabledStatistics {
907 type Err = String;
908
909 fn from_str(s: &str) -> Result<Self, Self::Err> {
910 match s {
911 "NONE" | "none" => Ok(EnabledStatistics::None),
912 "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk),
913 "PAGE" | "page" => Ok(EnabledStatistics::Page),
914 _ => Err(format!("Invalid statistics arg: {}", s)),
915 }
916 }
917}
918
/// The default statistics level is [`DEFAULT_STATISTICS_ENABLED`].
impl Default for EnabledStatistics {
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
924
/// Controls the bloom filter to be computed by the writer.
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability. This should always be between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp: the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// Setting it to a very small number diminishes the value of the filter itself, as the bitset size is
    /// even larger than just storing the whole value. You are also expected to set `ndv` if it can
    /// be known in advance to greatly reduce space usage.
    pub fpp: f64,
    /// Number of distinct values. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Usage of bloom filter is most beneficial for columns with large cardinality, so a good heuristic
    /// is to set ndv to the number of rows. However, it can reduce disk size if you know in advance a smaller
    /// number of distinct values. For very small ndv value it is probably not worth it to use bloom filter
    /// anyway.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
    pub ndv: u64,
}
952
953impl Default for BloomFilterProperties {
954 fn default() -> Self {
955 BloomFilterProperties {
956 fpp: DEFAULT_BLOOM_FILTER_FPP,
957 ndv: DEFAULT_BLOOM_FILTER_NDV,
958 }
959 }
960}
961
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    /// Encoding requested for the column; never a dictionary encoding, since
    /// `set_encoding` rejects those.
    encoding: Option<Encoding>,
    /// Compression codec requested for the column.
    codec: Option<Compression>,
    /// Whether dictionary encoding was explicitly enabled or disabled.
    dictionary_enabled: Option<bool>,
    /// Statistics level requested for the column.
    statistics_enabled: Option<EnabledStatistics>,
    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
    max_statistics_size: Option<usize>,
    /// bloom filter related properties; `None` means the bloom filter is not
    /// enabled through these properties.
    bloom_filter_properties: Option<BloomFilterProperties>,
}
977
978impl ColumnProperties {
979 /// Sets encoding for this column.
980 ///
981 /// If dictionary is not enabled, this is treated as a primary encoding for a column.
982 /// In case when dictionary is enabled for a column, this value is considered to
983 /// be a fallback encoding.
984 ///
985 /// Panics if user tries to set dictionary encoding here, regardless of dictionary
986 /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
987 /// for a column.
988 fn set_encoding(&mut self, value: Encoding) {
989 if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
990 panic!("Dictionary encoding can not be used as fallback encoding");
991 }
992 self.encoding = Some(value);
993 }
994
995 /// Sets compression codec for this column.
996 fn set_compression(&mut self, value: Compression) {
997 self.codec = Some(value);
998 }
999
1000 /// Sets whether dictionary encoding is enabled for this column.
1001 fn set_dictionary_enabled(&mut self, enabled: bool) {
1002 self.dictionary_enabled = Some(enabled);
1003 }
1004
1005 /// Sets the statistics level for this column.
1006 fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1007 self.statistics_enabled = Some(enabled);
1008 }
1009
1010 /// Sets max size for statistics for this column.
1011 #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
1012 #[allow(deprecated)]
1013 fn set_max_statistics_size(&mut self, value: usize) {
1014 self.max_statistics_size = Some(value);
1015 }
1016
1017 /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1018 /// otherwise it is a no-op.
1019 /// If `value` is `false`, resets bloom filter properties to `None`.
1020 fn set_bloom_filter_enabled(&mut self, value: bool) {
1021 if value && self.bloom_filter_properties.is_none() {
1022 self.bloom_filter_properties = Some(Default::default())
1023 } else if !value {
1024 self.bloom_filter_properties = None
1025 }
1026 }
1027
1028 /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1029 /// bloom filter if not previously enabled.
1030 ///
1031 /// # Panics
1032 ///
1033 /// Panics if the `value` is not between 0 and 1 exclusive
1034 fn set_bloom_filter_fpp(&mut self, value: f64) {
1035 assert!(
1036 value > 0. && value < 1.0,
1037 "fpp must be between 0 and 1 exclusive, got {value}"
1038 );
1039
1040 self.bloom_filter_properties
1041 .get_or_insert_with(Default::default)
1042 .fpp = value;
1043 }
1044
1045 /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
1046 /// enables bloom filter if not previously enabled.
1047 fn set_bloom_filter_ndv(&mut self, value: u64) {
1048 self.bloom_filter_properties
1049 .get_or_insert_with(Default::default)
1050 .ndv = value;
1051 }
1052
1053 /// Returns optional encoding for this column.
1054 fn encoding(&self) -> Option<Encoding> {
1055 self.encoding
1056 }
1057
1058 /// Returns optional compression codec for this column.
1059 fn compression(&self) -> Option<Compression> {
1060 self.codec
1061 }
1062
1063 /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1064 /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1065 /// been provided.
1066 fn dictionary_enabled(&self) -> Option<bool> {
1067 self.dictionary_enabled
1068 }
1069
1070 /// Returns optional statistics level requested for this column. If result is `None`,
1071 /// then no setting has been provided.
1072 fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1073 self.statistics_enabled
1074 }
1075
1076 /// Returns optional max size in bytes for statistics.
1077 #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
1078 fn max_statistics_size(&self) -> Option<usize> {
1079 #[allow(deprecated)]
1080 self.max_statistics_size
1081 }
1082
1083 /// Returns the bloom filter properties, or `None` if not enabled
1084 fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1085 self.bloom_filter_properties.as_ref()
1086 }
1087}
1088
/// Reference counted reader properties: [`ReaderProperties`] wrapped in an [`Arc`].
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;
1091
/// Default for whether bloom filters are read: they are not, unless requested
/// through [`ReaderPropertiesBuilder::set_read_bloom_filter`].
const DEFAULT_READ_BLOOM_FILTER: bool = false;
1093
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    /// Codec options, built from the builder's `CodecOptionsBuilder`.
    codec_options: CodecOptions,
    /// Whether bloom filters should be read from the file.
    read_bloom_filter: bool,
}
1116
impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether to read bloom filter
    ///
    /// This is `false` unless it was enabled through
    /// [`ReaderPropertiesBuilder::set_read_bloom_filter`].
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }
}
1133
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    /// Accumulates the codec related options.
    codec_options_builder: CodecOptionsBuilder,
    /// Explicit bloom filter read setting; `None` until the caller sets one,
    /// in which case `build` falls back to the default.
    read_bloom_filter: Option<bool>,
}
1140
1141/// Reader properties builder.
1142impl ReaderPropertiesBuilder {
1143 /// Returns default state of the builder.
1144 fn with_defaults() -> Self {
1145 Self {
1146 codec_options_builder: CodecOptionsBuilder::default(),
1147 read_bloom_filter: None,
1148 }
1149 }
1150
1151 /// Finalizes the configuration and returns immutable reader properties struct.
1152 pub fn build(self) -> ReaderProperties {
1153 ReaderProperties {
1154 codec_options: self.codec_options_builder.build(),
1155 read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1156 }
1157 }
1158
1159 /// Enable/disable backward compatible LZ4.
1160 ///
1161 /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1162 /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1163 /// with files generated by older versions of this library, and LZ4_RAW, for backward
1164 /// compatibility with files generated by older versions of parquet-cpp.
1165 ///
1166 /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1167 pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1168 self.codec_options_builder = self
1169 .codec_options_builder
1170 .set_backward_compatible_lz4(value);
1171 self
1172 }
1173
1174 /// Enable/disable reading bloom filter
1175 ///
1176 /// If reading bloom filter is enabled, bloom filter will be read from the file.
1177 /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1178 ///
1179 /// By default bloom filter is set to be read.
1180 pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1181 self.read_bloom_filter = Some(value);
1182 self
1183 }
1184}
1185
/// Unit tests for writer and reader properties, their builders and the
/// string parsing of the configuration enums.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        assert!(props
            .bloom_filter_properties(&ColumnPath::from("col"))
            .is_none());
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        // The dictionary flag does not matter: a dictionary encoding is always rejected.
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "fpp must be between 0 and 1 exclusive")]
    fn test_writer_properties_panic_when_column_bloom_filter_fpp_out_of_range() {
        // The valid range is exclusive, so 1.0 itself must be rejected.
        WriterProperties::builder()
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 1.0)
            .build();
    }

    #[test]
    #[should_panic(expected = "Cannot have a 0 column index truncate length")]
    fn test_writer_properties_panic_when_column_index_truncate_length_is_zero() {
        WriterProperties::builder()
            .set_column_index_truncate_length(Some(0))
            .build();
    }

    #[test]
    #[should_panic(expected = "Cannot have a 0 statistics truncate length")]
    fn test_writer_properties_panic_when_statistics_truncate_length_is_zero() {
        WriterProperties::builder()
            .set_statistics_truncate_length(Some(0))
            .build();
    }

    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_size(40)
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
        assert_eq!(props.data_page_size_limit(), 10);
        assert_eq!(props.dictionary_page_size_limit(), 20);
        assert_eq!(props.write_batch_size(), 30);
        assert_eq!(props.max_row_group_size(), 40);
        assert_eq!(props.created_by(), "default");
        assert_eq!(
            props.key_value_metadata(),
            Some(&vec![
                KeyValue::new("key".to_string(), "value".to_string(),)
            ])
        );

        // A column without specific settings sees the global ones.
        assert_eq!(
            props.encoding(&ColumnPath::from("a")),
            Some(Encoding::DELTA_BINARY_PACKED)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("a")),
            Compression::GZIP(Default::default())
        );
        assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("a")),
            EnabledStatistics::None
        );

        // The column with specific settings sees those instead.
        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::SNAPPY
        );
        assert!(props.dictionary_enabled(&ColumnPath::from("col")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            EnabledStatistics::Chunk
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
        );
    }

    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        // Setting only ndv enables the filter with the default fpp.
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 100
            })
        );
        // Setting only fpp enables the filter with the default ndv.
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(!props.read_bloom_filter());
    }

    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
    }

    #[test]
    fn test_reader_properties_read_bloom_filter() {
        // Reading bloom filters is opt-in and can be switched both ways.
        let props = ReaderProperties::builder()
            .set_read_bloom_filter(true)
            .build();
        assert!(props.read_bloom_filter());

        let props = ReaderProperties::builder()
            .set_read_bloom_filter(true)
            .set_read_bloom_filter(false)
            .build();
        assert!(!props.read_bloom_filter());
    }

    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "chunk".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "page".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        //test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }
}