// parquet/file/properties.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::KeyValue;
24use crate::format::SortingColumn;
25use crate::schema::types::ColumnPath;
26use std::str::FromStr;
27use std::{collections::HashMap, sync::Arc};
28
/// Default value for [`WriterProperties::data_page_size_limit`] (1 MiB)
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`] (rows per internal batch)
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`] (no compression)
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`] (same as [`DEFAULT_PAGE_SIZE`])
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`] (rows)
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::max_statistics_size`]
#[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
/// Default value for [`WriterProperties::max_row_group_size`] (rows, not bytes)
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`] (embeds this crate's version)
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]:
/// column index min/max values are truncated to 64 bytes by default
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`] (false positive probability)
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`] (number of distinct values)
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default values for [`WriterProperties::statistics_truncate_length`]:
/// statistics min/max values are not truncated by default
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;
/// Default value for [`WriterProperties::offset_index_disabled`]:
/// offset indexes are written by default
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default values for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
66
67/// Parquet writer version.
68///
69/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}
78
79impl WriterVersion {
80 /// Returns writer version as `i32`.
81 pub fn as_num(&self) -> i32 {
82 match self {
83 WriterVersion::PARQUET_1_0 => 1,
84 WriterVersion::PARQUET_2_0 => 2,
85 }
86 }
87}
88
89impl FromStr for WriterVersion {
90 type Err = String;
91
92 fn from_str(s: &str) -> Result<Self, Self::Err> {
93 match s {
94 "PARQUET_1_0" | "parquet_1_0" => Ok(WriterVersion::PARQUET_1_0),
95 "PARQUET_2_0" | "parquet_2_0" => Ok(WriterVersion::PARQUET_2_0),
96 _ => Err(format!("Invalid writer version: {}", s)),
97 }
98 }
99}
100
101/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
102/// write Bloom filters
103///
104/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    ///
    /// This is the default (see [`DEFAULT_BLOOM_FILTER_POSITION`]).
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
118
/// Reference counted writer properties.
///
/// Wrapping in [`Arc`] lets the same immutable configuration be shared cheaply
/// across multiple writers.
pub type WriterPropertiesPtr = Arc<WriterProperties>;
121
122/// Configuration settings for writing parquet files.
123///
124/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
125///
126/// # Example
127///
128/// ```rust
129/// # use parquet::{
130/// # basic::{Compression, Encoding},
131/// # file::properties::*,
132/// # schema::types::ColumnPath,
133/// # };
134/// #
135/// // Create properties with default configuration.
136/// let props = WriterProperties::default();
137///
138/// // Use properties builder to set certain options and assemble the configuration.
139/// let props = WriterProperties::builder()
140/// .set_writer_version(WriterVersion::PARQUET_1_0)
141/// .set_encoding(Encoding::PLAIN)
142/// .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
143/// .set_compression(Compression::SNAPPY)
144/// .build();
145///
146/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
147/// assert_eq!(
148/// props.encoding(&ColumnPath::from("col1")),
149/// Some(Encoding::DELTA_BINARY_PACKED)
150/// );
151/// assert_eq!(
152/// props.encoding(&ColumnPath::from("col2")),
153/// Some(Encoding::PLAIN)
154/// );
155/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    // Best-effort maximum size of a data page, in bytes.
    data_page_size_limit: usize,
    // Best-effort maximum size of a dictionary page, in bytes.
    dictionary_page_size_limit: usize,
    // Best-effort maximum number of rows per data page.
    data_page_row_count_limit: usize,
    // Number of rows written per internal batch; limits above are checked
    // between batches, so this bounds their enforcement granularity.
    write_batch_size: usize,
    // Maximum number of rows per row group.
    max_row_group_size: usize,
    // Where bloom filters are placed in the file.
    bloom_filter_position: BloomFilterPosition,
    // Format version recorded in the file metadata.
    writer_version: WriterVersion,
    // "created_by" string written to the file metadata.
    created_by: String,
    // Whether offset index writing is disabled (may be overridden by page stats).
    offset_index_disabled: bool,
    // Optional application-defined key/value metadata.
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    // Fallback properties applied to any column without an explicit override.
    default_column_properties: ColumnProperties,
    // Per-column property overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    // Declared sort order of rows within row groups, if any.
    sorting_columns: Option<Vec<SortingColumn>>,
    // Max length of min/max values in the column index (None = no truncation).
    column_index_truncate_length: Option<usize>,
    // Max length of min/max values in statistics (None = no truncation).
    statistics_truncate_length: Option<usize>,
    // Whether to coerce Arrow types to Parquet-native representations.
    coerce_types: bool,
    #[cfg(feature = "encryption")]
    // Optional settings for encrypting the written file.
    pub(crate) file_encryption_properties: Option<FileEncryptionProperties>,
}
177
178impl Default for WriterProperties {
179 fn default() -> Self {
180 Self::builder().build()
181 }
182}
183
184impl WriterProperties {
185 /// Create a new [`WriterProperties`] with the default settings
186 ///
187 /// See [`WriterProperties::builder`] for customising settings
188 pub fn new() -> Self {
189 Self::default()
190 }
191
192 /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
193 /// properties.
194 pub fn builder() -> WriterPropertiesBuilder {
195 WriterPropertiesBuilder::with_defaults()
196 }
197
198 /// Returns data page size limit.
199 ///
200 /// Note: this is a best effort limit based on the write batch size
201 ///
202 /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
203 pub fn data_page_size_limit(&self) -> usize {
204 self.data_page_size_limit
205 }
206
207 /// Returns dictionary page size limit.
208 ///
209 /// Note: this is a best effort limit based on the write batch size
210 ///
211 /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
212 pub fn dictionary_page_size_limit(&self) -> usize {
213 self.dictionary_page_size_limit
214 }
215
216 /// Returns the maximum page row count
217 ///
218 /// Note: this is a best effort limit based on the write batch size
219 ///
220 /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
221 pub fn data_page_row_count_limit(&self) -> usize {
222 self.data_page_row_count_limit
223 }
224
225 /// Returns configured batch size for writes.
226 ///
227 /// When writing a batch of data, this setting allows to split it internally into
228 /// smaller batches so we can better estimate the size of a page currently being
229 /// written.
230 pub fn write_batch_size(&self) -> usize {
231 self.write_batch_size
232 }
233
234 /// Returns maximum number of rows in a row group.
235 pub fn max_row_group_size(&self) -> usize {
236 self.max_row_group_size
237 }
238
239 /// Returns bloom filter position.
240 pub fn bloom_filter_position(&self) -> BloomFilterPosition {
241 self.bloom_filter_position
242 }
243
244 /// Returns configured writer version.
245 pub fn writer_version(&self) -> WriterVersion {
246 self.writer_version
247 }
248
249 /// Returns `created_by` string.
250 pub fn created_by(&self) -> &str {
251 &self.created_by
252 }
253
254 /// Returns `true` if offset index writing is disabled.
255 pub fn offset_index_disabled(&self) -> bool {
256 // If page statistics are to be collected, then do not disable the offset indexes.
257 let default_page_stats_enabled =
258 self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
259 let column_page_stats_enabled = self
260 .column_properties
261 .iter()
262 .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
263 if default_page_stats_enabled || column_page_stats_enabled {
264 return false;
265 }
266
267 self.offset_index_disabled
268 }
269
270 /// Returns `key_value_metadata` KeyValue pairs.
271 pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
272 self.key_value_metadata.as_ref()
273 }
274
275 /// Returns sorting columns.
276 pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
277 self.sorting_columns.as_ref()
278 }
279
280 /// Returns the maximum length of truncated min/max values in the column index.
281 ///
282 /// `None` if truncation is disabled, must be greater than 0 otherwise.
283 pub fn column_index_truncate_length(&self) -> Option<usize> {
284 self.column_index_truncate_length
285 }
286
287 /// Returns the maximum length of truncated min/max values in statistics.
288 ///
289 /// `None` if truncation is disabled, must be greater than 0 otherwise.
290 pub fn statistics_truncate_length(&self) -> Option<usize> {
291 self.statistics_truncate_length
292 }
293
294 /// Returns `true` if type coercion is enabled.
295 pub fn coerce_types(&self) -> bool {
296 self.coerce_types
297 }
298
299 /// Returns encoding for a data page, when dictionary encoding is enabled.
300 /// This is not configurable.
301 #[inline]
302 pub fn dictionary_data_page_encoding(&self) -> Encoding {
303 // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
304 // Dictionary values are encoded using RLE_DICTIONARY encoding.
305 Encoding::RLE_DICTIONARY
306 }
307
308 /// Returns encoding for dictionary page, when dictionary encoding is enabled.
309 /// This is not configurable.
310 #[inline]
311 pub fn dictionary_page_encoding(&self) -> Encoding {
312 // PLAIN_DICTIONARY is deprecated in writer version 1.
313 // Dictionary is encoded using plain encoding.
314 Encoding::PLAIN
315 }
316
317 /// Returns encoding for a column, if set.
318 /// In case when dictionary is enabled, returns fallback encoding.
319 ///
320 /// If encoding is not set, then column writer will choose the best encoding
321 /// based on the column type.
322 pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
323 self.column_properties
324 .get(col)
325 .and_then(|c| c.encoding())
326 .or_else(|| self.default_column_properties.encoding())
327 }
328
329 /// Returns compression codec for a column.
330 pub fn compression(&self, col: &ColumnPath) -> Compression {
331 self.column_properties
332 .get(col)
333 .and_then(|c| c.compression())
334 .or_else(|| self.default_column_properties.compression())
335 .unwrap_or(DEFAULT_COMPRESSION)
336 }
337
338 /// Returns `true` if dictionary encoding is enabled for a column.
339 pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
340 self.column_properties
341 .get(col)
342 .and_then(|c| c.dictionary_enabled())
343 .or_else(|| self.default_column_properties.dictionary_enabled())
344 .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
345 }
346
347 /// Returns which statistics are written for a column.
348 pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
349 self.column_properties
350 .get(col)
351 .and_then(|c| c.statistics_enabled())
352 .or_else(|| self.default_column_properties.statistics_enabled())
353 .unwrap_or(DEFAULT_STATISTICS_ENABLED)
354 }
355
356 /// Returns max size for statistics.
357 /// Only applicable if statistics are enabled.
358 #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
359 pub fn max_statistics_size(&self, col: &ColumnPath) -> usize {
360 #[allow(deprecated)]
361 self.column_properties
362 .get(col)
363 .and_then(|c| c.max_statistics_size())
364 .or_else(|| self.default_column_properties.max_statistics_size())
365 .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
366 }
367
368 /// Returns the [`BloomFilterProperties`] for the given column
369 ///
370 /// Returns `None` if bloom filter is disabled
371 pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
372 self.column_properties
373 .get(col)
374 .and_then(|c| c.bloom_filter_properties())
375 .or_else(|| self.default_column_properties.bloom_filter_properties())
376 }
377
378 /// Return file encryption properties
379 #[cfg(feature = "encryption")]
380 pub fn file_encryption_properties(&self) -> Option<&FileEncryptionProperties> {
381 self.file_encryption_properties.as_ref()
382 }
383}
384
385/// Builder for [`WriterProperties`] parquet writer configuration.
386///
387/// See example on [`WriterProperties`]
// Fields mirror [`WriterProperties`]; see the corresponding setter methods for
// the meaning and default of each field.
pub struct WriterPropertiesBuilder {
    data_page_size_limit: usize,
    dictionary_page_size_limit: usize,
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_size: usize,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    offset_index_disabled: bool,
    key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<FileEncryptionProperties>,
}
408
impl WriterPropertiesBuilder {
    /// Returns default state of the builder.
    fn with_defaults() -> Self {
        Self {
            data_page_size_limit: DEFAULT_PAGE_SIZE,
            dictionary_page_size_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT,
            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
            max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
            writer_version: DEFAULT_WRITER_VERSION,
            created_by: DEFAULT_CREATED_BY.to_string(),
            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
            key_value_metadata: None,
            default_column_properties: Default::default(),
            column_properties: HashMap::new(),
            sorting_columns: None,
            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
            coerce_types: DEFAULT_COERCE_TYPES,
            #[cfg(feature = "encryption")]
            file_encryption_properties: None,
        }
    }

    /// Finalizes the configuration and returns immutable writer properties struct.
    pub fn build(self) -> WriterProperties {
        WriterProperties {
            data_page_size_limit: self.data_page_size_limit,
            dictionary_page_size_limit: self.dictionary_page_size_limit,
            data_page_row_count_limit: self.data_page_row_count_limit,
            write_batch_size: self.write_batch_size,
            max_row_group_size: self.max_row_group_size,
            bloom_filter_position: self.bloom_filter_position,
            writer_version: self.writer_version,
            created_by: self.created_by,
            offset_index_disabled: self.offset_index_disabled,
            key_value_metadata: self.key_value_metadata,
            default_column_properties: self.default_column_properties,
            column_properties: self.column_properties,
            sorting_columns: self.sorting_columns,
            column_index_truncate_length: self.column_index_truncate_length,
            statistics_truncate_length: self.statistics_truncate_length,
            coerce_types: self.coerce_types,
            #[cfg(feature = "encryption")]
            file_encryption_properties: self.file_encryption_properties,
        }
    }

    // ----------------------------------------------------------------------
    // Writer properties related to a file

    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`])
    ///
    /// This value can determine what features some readers will support.
    ///
    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
        self.writer_version = value;
        self
    }

    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`).
    ///
    /// The parquet writer will attempt to limit the sizes of each
    /// `DataPage` to this many bytes. Reducing this value will result
    /// in larger parquet files, but may improve the effectiveness of
    /// page index based predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
        self.data_page_size_limit = value;
        self
    }

    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`).
    ///
    /// The parquet writer will attempt to limit the number of rows in
    /// each `DataPage` to this value. Reducing this value will result
    /// in larger parquet files, but may improve the effectiveness of
    /// page index based predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
        self.data_page_row_count_limit = value;
        self
    }

    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`).
    ///
    /// The parquet writer will attempt to limit the size of each
    /// `DataPage` used to store dictionaries to this many
    /// bytes. Reducing this value will result in larger parquet
    /// files, but may improve the effectiveness of page index based
    /// predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
        self.dictionary_page_size_limit = value;
        self
    }

    /// Sets write batch size (defaults to 1024).
    ///
    /// For performance reasons, data for each column is written in
    /// batches of this size.
    ///
    /// Additional limits such as
    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
    /// are checked between batches, and thus the write batch size value acts as an
    /// upper-bound on the enforcement granularity of other limits.
    pub fn set_write_batch_size(mut self, value: usize) -> Self {
        self.write_batch_size = value;
        self
    }

    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`).
    ///
    /// # Panics
    /// If the value is set to 0.
    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
        assert!(value > 0, "Cannot have a 0 max row group size");
        self.max_row_group_size = value;
        self
    }

    /// Sets where in the final file Bloom Filters are written (default `AfterRowGroup`)
    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
        self.bloom_filter_position = value;
        self
    }

    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>`).
    pub fn set_created_by(mut self, value: String) -> Self {
        self.created_by = value;
        self
    }

    /// Sets whether the writing of offset indexes is disabled (defaults to `false`).
    ///
    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
    ///
    /// Note: As the offset indexes are useful for accessing data by row number,
    /// they are always written by default, regardless of whether other statistics
    /// are enabled. Disabling this metadata may result in a degradation in read
    /// performance, so use this option with care.
    ///
    /// [`Page`]: EnabledStatistics::Page
    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
        self.offset_index_disabled = value;
        self
    }

    /// Sets "key_value_metadata" property (defaults to `None`).
    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
        self.key_value_metadata = value;
        self
    }

    /// Sets sorting order of rows in the row group if any (defaults to `None`).
    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
        self.sorting_columns = value;
        self
    }

    // ----------------------------------------------------------------------
    // Setters for any column (global)

    /// Sets default encoding for all columns.
    ///
    /// If dictionary is not enabled, this is treated as a primary encoding for all
    /// columns. In case when dictionary is enabled for any column, this value is
    /// considered to be a fallback encoding for that column.
    ///
    /// # Panics
    ///
    /// if dictionary encoding is specified, regardless of dictionary
    /// encoding flag being set.
    pub fn set_encoding(mut self, value: Encoding) -> Self {
        self.default_column_properties.set_encoding(value);
        self
    }

    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`]).
    ///
    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.default_column_properties.set_compression(value);
        self
    }

    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`).
    ///
    /// Use this method to set dictionary encoding, instead of explicitly specifying
    /// encoding in `set_encoding` method.
    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
        self.default_column_properties.set_dictionary_enabled(value);
        self
    }

    /// Sets default statistics level for all columns (defaults to [`Page`]).
    ///
    /// [`Page`]: EnabledStatistics::Page
    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
        self.default_column_properties.set_statistics_enabled(value);
        self
    }

    /// Sets default max statistics size for all columns (defaults to `4096`).
    ///
    /// Applicable only if statistics are enabled.
    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
    pub fn set_max_statistics_size(mut self, value: usize) -> Self {
        #[allow(deprecated)]
        self.default_column_properties
            .set_max_statistics_size(value);
        self
    }

    /// Sets if bloom filter is enabled by default for all columns (defaults to `false`).
    ///
    /// # Notes
    ///
    /// * If the bloom filter is enabled previously then it is a no-op.
    ///
    /// * If the bloom filter is not enabled, default values for ndv and fpp
    ///   are used. See [`set_bloom_filter_ndv`] and
    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
    ///
    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
        self.default_column_properties
            .set_bloom_filter_enabled(value);
        self
    }

    /// Sets the default target bloom filter false positive probability (fpp)
    /// for all columns (defaults to `0.05`).
    ///
    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
    /// been called.
    ///
    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
        self.default_column_properties.set_bloom_filter_fpp(value);
        self
    }

    /// Sets default number of distinct values (ndv) for bloom filter for all
    /// columns (defaults to `1_000_000`).
    ///
    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
    /// been called.
    ///
    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
        self.default_column_properties.set_bloom_filter_ndv(value);
        self
    }

    // ----------------------------------------------------------------------
    // Setters for a specific column

    /// Helper method to get existing or new mutable reference of column properties.
    #[inline]
    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
        self.column_properties.entry(col).or_default()
    }

    /// Sets encoding for a specific column.
    ///
    /// Takes precedence over [`Self::set_encoding`].
    ///
    /// If dictionary is not enabled, this is treated as a primary encoding for this
    /// column. In case when dictionary is enabled for this column, either through
    /// global defaults or explicitly, this value is considered to be a fallback
    /// encoding for this column.
    ///
    /// # Panics
    /// If user tries to set dictionary encoding here, regardless of dictionary
    /// encoding flag being set.
    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
        self.get_mut_props(col).set_encoding(value);
        self
    }

    /// Sets compression codec for a specific column.
    ///
    /// Takes precedence over [`Self::set_compression`].
    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
        self.get_mut_props(col).set_compression(value);
        self
    }

    /// Sets flag to enable/disable dictionary encoding for a specific column.
    ///
    /// Takes precedence over [`Self::set_dictionary_enabled`].
    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
        self.get_mut_props(col).set_dictionary_enabled(value);
        self
    }

    /// Sets statistics level for a specific column.
    ///
    /// Takes precedence over [`Self::set_statistics_enabled`].
    pub fn set_column_statistics_enabled(
        mut self,
        col: ColumnPath,
        value: EnabledStatistics,
    ) -> Self {
        self.get_mut_props(col).set_statistics_enabled(value);
        self
    }

    /// Sets max size for statistics for a specific column.
    ///
    /// Takes precedence over [`Self::set_max_statistics_size`].
    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
    pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self {
        #[allow(deprecated)]
        self.get_mut_props(col).set_max_statistics_size(value);
        self
    }

    /// Sets whether a bloom filter should be written for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
        self.get_mut_props(col).set_bloom_filter_enabled(value);
        self
    }

    /// Sets the false positive probability for bloom filter for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
        self.get_mut_props(col).set_bloom_filter_fpp(value);
        self
    }

    /// Sets the number of distinct values for bloom filter for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
        self.get_mut_props(col).set_bloom_filter_ndv(value);
        self
    }

    /// Sets the max length of min/max value fields when writing the column
    /// [`Index`] (defaults to `Some(64)`, see [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
    ///
    /// This can be used to prevent columns with very long values (hundreds of
    /// bytes long) from causing the parquet metadata to become huge.
    ///
    /// # Notes
    ///
    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
    /// set to [`EnabledStatistics::Page`].
    ///
    /// * If `Some`, must be greater than 0, otherwise will panic
    /// * If `None`, there's no effective limit.
    ///
    /// [`Index`]: crate::file::page_index::index::Index
    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
        if let Some(value) = max_length {
            assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`.");
        }

        self.column_index_truncate_length = max_length;
        self
    }

    /// Sets the max length of min/max value fields in row group level
    /// [`Statistics`] (defaults to `None`).
    ///
    /// # Notes
    /// Row group level [`Statistics`] are written when [`Self::set_statistics_enabled`] is
    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`].
    ///
    /// * If `Some`, must be greater than 0, otherwise will panic
    /// * If `None`, there's no effective limit.
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
        if let Some(value) = max_length {
            assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
        }

        self.statistics_truncate_length = max_length;
        self
    }

    /// Should the writer coerce types to parquet native types (defaults to `false`).
    ///
    /// Leaving this option the default `false` will ensure the exact same data
    /// written to parquet using this library will be read.
    ///
    /// Setting this option to `true` will result in parquet files that can be
    /// read by more readers, but potentially lose information in the process.
    ///
    /// * Types such as [`DataType::Date64`], which have no direct corresponding
    ///   Parquet type, may be stored with lower precision.
    ///
    /// * The internal field names of `List` and `Map` types will be renamed if
    ///   necessary to match what is required by the newest Parquet specification.
    ///
    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
    ///
    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
        self.coerce_types = coerce_types;
        self
    }

    /// Sets FileEncryptionProperties.
    #[cfg(feature = "encryption")]
    pub fn with_file_encryption_properties(
        mut self,
        file_encryption_properties: FileEncryptionProperties,
    ) -> Self {
        self.file_encryption_properties = Some(file_encryption_properties);
        self
    }
}
838
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// One set of statistics is stored per relevant column per row group, so
    /// the more row groups written, the more statistics are stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// One set of statistics is stored per relevant column per page and per
    /// row group, so the more pages and row groups written, the more
    /// statistics are stored.
    Page,
}

impl FromStr for EnabledStatistics {
    type Err = String;

    // Accepts the variant names in all-uppercase or all-lowercase form only;
    // anything else (including mixed case) is rejected.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "NONE" | "none" => Ok(Self::None),
            "CHUNK" | "chunk" => Ok(Self::Chunk),
            "PAGE" | "page" => Ok(Self::Page),
            other => Err(format!("Invalid statistics arg: {other}")),
        }
    }
}
878
// Defaults to the crate-wide `DEFAULT_STATISTICS_ENABLED` constant.
impl Default for EnabledStatistics {
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
884
/// Controls the bloom filter to be computed by the writer.
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability, should always be between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp: the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// Setting it to a very small number diminishes the value of the filter itself, as the bitset size is
    /// even larger than just storing the whole value. You are also expected to set `ndv` if it can
    /// be known in advance in order to largely reduce space usage.
    pub fpp: f64,
    /// Number of distinct values, should be non-negative to be meaningful. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Usage of a bloom filter is most beneficial for columns with large cardinality, so a good heuristic
    /// is to set ndv to the number of rows. However it can reduce disk size if you know in advance a smaller
    /// number of distinct values. For a very small ndv value it is probably not worth it to use a bloom filter
    /// anyway.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
    pub ndv: u64,
}
912
913impl Default for BloomFilterProperties {
914 fn default() -> Self {
915 BloomFilterProperties {
916 fpp: DEFAULT_BLOOM_FILTER_FPP,
917 ndv: DEFAULT_BLOOM_FILTER_NDV,
918 }
919 }
920}
921
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    // Fallback (non-dictionary) encoding for this column.
    encoding: Option<Encoding>,
    // Compression codec for this column.
    codec: Option<Compression>,
    // Whether dictionary encoding is enabled for this column.
    dictionary_enabled: Option<bool>,
    // Level of statistics (none / chunk / page) collected for this column.
    statistics_enabled: Option<EnabledStatistics>,
    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
    max_statistics_size: Option<usize>,
    /// bloom filter related properties
    bloom_filter_properties: Option<BloomFilterProperties>,
}
937
938impl ColumnProperties {
939 /// Sets encoding for this column.
940 ///
941 /// If dictionary is not enabled, this is treated as a primary encoding for a column.
942 /// In case when dictionary is enabled for a column, this value is considered to
943 /// be a fallback encoding.
944 ///
945 /// Panics if user tries to set dictionary encoding here, regardless of dictionary
946 /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
947 /// for a column.
948 fn set_encoding(&mut self, value: Encoding) {
949 if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
950 panic!("Dictionary encoding can not be used as fallback encoding");
951 }
952 self.encoding = Some(value);
953 }
954
955 /// Sets compression codec for this column.
956 fn set_compression(&mut self, value: Compression) {
957 self.codec = Some(value);
958 }
959
960 /// Sets whether or not dictionary encoding is enabled for this column.
961 fn set_dictionary_enabled(&mut self, enabled: bool) {
962 self.dictionary_enabled = Some(enabled);
963 }
964
965 /// Sets the statistics level for this column.
966 fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
967 self.statistics_enabled = Some(enabled);
968 }
969
970 /// Sets max size for statistics for this column.
971 #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
972 #[allow(deprecated)]
973 fn set_max_statistics_size(&mut self, value: usize) {
974 self.max_statistics_size = Some(value);
975 }
976
977 /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
978 /// otherwise it is a no-op.
979 /// If `value` is `false`, resets bloom filter properties to `None`.
980 fn set_bloom_filter_enabled(&mut self, value: bool) {
981 if value && self.bloom_filter_properties.is_none() {
982 self.bloom_filter_properties = Some(Default::default())
983 } else if !value {
984 self.bloom_filter_properties = None
985 }
986 }
987
988 /// Sets the false positive probability for bloom filter for this column, and implicitly enables
989 /// bloom filter if not previously enabled.
990 ///
991 /// # Panics
992 ///
993 /// Panics if the `value` is not between 0 and 1 exclusive
994 fn set_bloom_filter_fpp(&mut self, value: f64) {
995 assert!(
996 value > 0. && value < 1.0,
997 "fpp must be between 0 and 1 exclusive, got {value}"
998 );
999
1000 self.bloom_filter_properties
1001 .get_or_insert_with(Default::default)
1002 .fpp = value;
1003 }
1004
1005 /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
1006 /// enables bloom filter if not previously enabled.
1007 fn set_bloom_filter_ndv(&mut self, value: u64) {
1008 self.bloom_filter_properties
1009 .get_or_insert_with(Default::default)
1010 .ndv = value;
1011 }
1012
1013 /// Returns optional encoding for this column.
1014 fn encoding(&self) -> Option<Encoding> {
1015 self.encoding
1016 }
1017
1018 /// Returns optional compression codec for this column.
1019 fn compression(&self) -> Option<Compression> {
1020 self.codec
1021 }
1022
1023 /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1024 /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1025 /// been provided.
1026 fn dictionary_enabled(&self) -> Option<bool> {
1027 self.dictionary_enabled
1028 }
1029
1030 /// Returns optional statistics level requested for this column. If result is `None`,
1031 /// then no setting has been provided.
1032 fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1033 self.statistics_enabled
1034 }
1035
1036 /// Returns optional max size in bytes for statistics.
1037 #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
1038 fn max_statistics_size(&self) -> Option<usize> {
1039 #[allow(deprecated)]
1040 self.max_statistics_size
1041 }
1042
1043 /// Returns the bloom filter properties, or `None` if not enabled
1044 fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1045 self.bloom_filter_properties.as_ref()
1046 }
1047}
1048
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

// Bloom filters are skipped on read unless explicitly enabled via the builder.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
1053
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    // Compression codec options (e.g. backward compatible LZ4 handling).
    codec_options: CodecOptions,
    // Whether bloom filters should be read from the file.
    read_bloom_filter: bool,
}
1076
impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether to read bloom filter
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }
}
1093
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    // Builder for codec options; finalized in `build`.
    codec_options_builder: CodecOptionsBuilder,
    // `None` means "use DEFAULT_READ_BLOOM_FILTER" at build time.
    read_bloom_filter: Option<bool>,
}
1100
1101/// Reader properties builder.
1102impl ReaderPropertiesBuilder {
1103 /// Returns default state of the builder.
1104 fn with_defaults() -> Self {
1105 Self {
1106 codec_options_builder: CodecOptionsBuilder::default(),
1107 read_bloom_filter: None,
1108 }
1109 }
1110
1111 /// Finalizes the configuration and returns immutable reader properties struct.
1112 pub fn build(self) -> ReaderProperties {
1113 ReaderProperties {
1114 codec_options: self.codec_options_builder.build(),
1115 read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1116 }
1117 }
1118
1119 /// Enable/disable backward compatible LZ4.
1120 ///
1121 /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1122 /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1123 /// with files generated by older versions of this library, and LZ4_RAW, for backward
1124 /// compatibility with files generated by older versions of parquet-cpp.
1125 ///
1126 /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1127 pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1128 self.codec_options_builder = self
1129 .codec_options_builder
1130 .set_backward_compatible_lz4(value);
1131 self
1132 }
1133
1134 /// Enable/disable reading bloom filter
1135 ///
1136 /// If reading bloom filter is enabled, bloom filter will be read from the file.
1137 /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1138 ///
1139 /// By default bloom filter is set to be read.
1140 pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1141 self.read_bloom_filter = Some(value);
1142 self
1143 }
1144}
1145
#[cfg(test)]
mod tests {
    use super::*;

    // Writer version maps to the numeric parquet format version.
    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    // A default-constructed WriterProperties must expose every documented default.
    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        // Bloom filters are opt-in, so none by default.
        assert!(props
            .bloom_filter_properties(&ColumnPath::from("col"))
            .is_none());
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    // The panic fires regardless of whether dictionary encoding is enabled...
    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    // ...or disabled.
    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    // End-to-end check that every builder setter is reflected in the built properties.
    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_size(40)
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
        assert_eq!(props.data_page_size_limit(), 10);
        assert_eq!(props.dictionary_page_size_limit(), 20);
        assert_eq!(props.write_batch_size(), 30);
        assert_eq!(props.max_row_group_size(), 40);
        assert_eq!(props.created_by(), "default");
        assert_eq!(
            props.key_value_metadata(),
            Some(&vec![
                KeyValue::new("key".to_string(), "value".to_string(),)
            ])
        );

        // Column "a" has no specific settings, so the global values apply.
        assert_eq!(
            props.encoding(&ColumnPath::from("a")),
            Some(Encoding::DELTA_BINARY_PACKED)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("a")),
            Compression::GZIP(Default::default())
        );
        assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("a")),
            EnabledStatistics::None
        );

        // Column "col" has explicit overrides that take precedence.
        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::SNAPPY
        );
        assert!(props.dictionary_enabled(&ColumnPath::from("col")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            EnabledStatistics::Chunk
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
        );
    }

    // Options not set explicitly must fall back to their defaults.
    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        // Enabling the bloom filter without further settings uses the default fpp/ndv.
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        // No bloom filter unless requested.
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        // Setting ndv implicitly enables the filter with the default fpp.
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 100
            })
        );
        // Setting fpp implicitly enables the filter with the default ndv.
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: 1_000_000_u64
            })
        );
    }

    // Reader defaults: backward-compatible LZ4 on, bloom filter reading off.
    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(!props.read_bloom_filter());
    }

    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
    }

    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);

        //test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }
}