parquet/file/properties.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21use crate::file::metadata::KeyValue;
22use crate::format::SortingColumn;
23use crate::schema::types::ColumnPath;
24use std::str::FromStr;
25use std::{collections::HashMap, sync::Arc};
26
/// Default value for [`WriterProperties::data_page_size_limit`]
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::max_statistics_size`]
#[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
/// Default value for [`WriterProperties::max_row_group_size`]
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
64
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns writer version as `i32`.
    pub fn as_num(&self) -> i32 {
        // Only two variants exist, so a boolean check suffices.
        if matches!(self, WriterVersion::PARQUET_1_0) {
            1
        } else {
            2
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a writer version from its all-uppercase or all-lowercase
    /// spelling; any other input (including mixed case) is rejected.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s == "PARQUET_1_0" || s == "parquet_1_0" {
            Ok(WriterVersion::PARQUET_1_0)
        } else if s == "PARQUET_2_0" || s == "parquet_2_0" {
            Ok(WriterVersion::PARQUET_2_0)
        } else {
            Err(format!("Invalid writer version: {}", s))
        }
    }
}
98
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
/// The writer default is [`DEFAULT_BLOOM_FILTER_POSITION`] (`AfterRowGroup`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
116
/// Reference counted writer properties.
///
/// An [`Arc`] handle to a [`WriterProperties`], allowing one configuration to
/// be shared cheaply (clone is a refcount bump, not a copy).
pub type WriterPropertiesPtr = Arc<WriterProperties>;
119
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #     basic::{Compression, Encoding},
/// #     file::properties::*,
/// #     schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    // Best-effort page/row-group size limits (see the same-named getters).
    data_page_size_limit: usize,
    dictionary_page_size_limit: usize,
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_size: usize,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    offset_index_disabled: bool,
    // Crate-visible so the file writer can take the metadata on write.
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    // Fallback properties for any column without a per-column override.
    default_column_properties: ColumnProperties,
    // Per-column overrides; take precedence over `default_column_properties`.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    // `None` disables truncation; see the corresponding builder setters.
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
}
173
174impl Default for WriterProperties {
175 fn default() -> Self {
176 Self::builder().build()
177 }
178}
179
impl WriterProperties {
    /// Create a new [`WriterProperties`] with the default settings
    ///
    /// See [`WriterProperties::builder`] for customising settings
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
    /// properties.
    pub fn builder() -> WriterPropertiesBuilder {
        WriterPropertiesBuilder::with_defaults()
    }

    /// Returns data page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
    pub fn data_page_size_limit(&self) -> usize {
        self.data_page_size_limit
    }

    /// Returns dictionary page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
    pub fn dictionary_page_size_limit(&self) -> usize {
        self.dictionary_page_size_limit
    }

    /// Returns the maximum page row count
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
    pub fn data_page_row_count_limit(&self) -> usize {
        self.data_page_row_count_limit
    }

    /// Returns configured batch size for writes.
    ///
    /// When writing a batch of data, this setting allows to split it internally into
    /// smaller batches so we can better estimate the size of a page currently being
    /// written.
    pub fn write_batch_size(&self) -> usize {
        self.write_batch_size
    }

    /// Returns maximum number of rows in a row group.
    pub fn max_row_group_size(&self) -> usize {
        self.max_row_group_size
    }

    /// Returns where in the file the writer places Bloom Filters.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
        self.bloom_filter_position
    }

    /// Returns configured writer version.
    pub fn writer_version(&self) -> WriterVersion {
        self.writer_version
    }

    /// Returns `created_by` string.
    pub fn created_by(&self) -> &str {
        &self.created_by
    }

    /// Returns `true` if offset index writing is disabled.
    ///
    /// Note: page-level statistics require offset indexes, so the disabled
    /// flag is ignored whenever any column collects page statistics.
    pub fn offset_index_disabled(&self) -> bool {
        // If page statistics are to be collected, then do not disable the offset indexes.
        let default_page_stats_enabled =
            self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
        let column_page_stats_enabled = self
            .column_properties
            .iter()
            .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
        if default_page_stats_enabled || column_page_stats_enabled {
            return false;
        }

        self.offset_index_disabled
    }

    /// Returns `key_value_metadata` KeyValue pairs.
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns sorting columns.
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Returns the maximum length of truncated min/max values in the column index.
    ///
    /// `None` if truncation is disabled, must be greater than 0 otherwise.
    pub fn column_index_truncate_length(&self) -> Option<usize> {
        self.column_index_truncate_length
    }

    /// Returns the maximum length of truncated min/max values in statistics.
    ///
    /// `None` if truncation is disabled, must be greater than 0 otherwise.
    pub fn statistics_truncate_length(&self) -> Option<usize> {
        self.statistics_truncate_length
    }

    /// Returns `true` if type coercion is enabled.
    pub fn coerce_types(&self) -> bool {
        self.coerce_types
    }

    /// Returns encoding for a data page, when dictionary encoding is enabled.
    /// This is not configurable.
    #[inline]
    pub fn dictionary_data_page_encoding(&self) -> Encoding {
        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
        // Dictionary values are encoded using RLE_DICTIONARY encoding.
        Encoding::RLE_DICTIONARY
    }

    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
    /// This is not configurable.
    #[inline]
    pub fn dictionary_page_encoding(&self) -> Encoding {
        // PLAIN_DICTIONARY is deprecated in writer version 1.
        // Dictionary is encoded using plain encoding.
        Encoding::PLAIN
    }

    /// Returns encoding for a column, if set.
    /// In case when dictionary is enabled, returns fallback encoding.
    ///
    /// If encoding is not set, then column writer will choose the best encoding
    /// based on the column type.
    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
        // Per-column setting takes precedence over the global default.
        self.column_properties
            .get(col)
            .and_then(|c| c.encoding())
            .or_else(|| self.default_column_properties.encoding())
    }

    /// Returns compression codec for a column.
    pub fn compression(&self, col: &ColumnPath) -> Compression {
        self.column_properties
            .get(col)
            .and_then(|c| c.compression())
            .or_else(|| self.default_column_properties.compression())
            .unwrap_or(DEFAULT_COMPRESSION)
    }

    /// Returns `true` if dictionary encoding is enabled for a column.
    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
        self.column_properties
            .get(col)
            .and_then(|c| c.dictionary_enabled())
            .or_else(|| self.default_column_properties.dictionary_enabled())
            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
    }

    /// Returns which statistics are written for a column.
    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
        self.column_properties
            .get(col)
            .and_then(|c| c.statistics_enabled())
            .or_else(|| self.default_column_properties.statistics_enabled())
            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
    }

    /// Returns max size for statistics.
    /// Only applicable if statistics are enabled.
    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
    pub fn max_statistics_size(&self, col: &ColumnPath) -> usize {
        #[allow(deprecated)]
        self.column_properties
            .get(col)
            .and_then(|c| c.max_statistics_size())
            .or_else(|| self.default_column_properties.max_statistics_size())
            .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
    }

    /// Returns the [`BloomFilterProperties`] for the given column
    ///
    /// Returns `None` if bloom filter is disabled
    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
        self.column_properties
            .get(col)
            .and_then(|c| c.bloom_filter_properties())
            .or_else(|| self.default_column_properties.bloom_filter_properties())
    }
}
374
/// Builder for [`WriterProperties`] parquet writer configuration.
///
/// See example on [`WriterProperties`]
pub struct WriterPropertiesBuilder {
    // Field-for-field mirror of `WriterProperties`; `build` moves each value
    // into the immutable struct.
    data_page_size_limit: usize,
    dictionary_page_size_limit: usize,
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_size: usize,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    offset_index_disabled: bool,
    key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
}
396
impl WriterPropertiesBuilder {
    /// Returns default state of the builder.
    fn with_defaults() -> Self {
        Self {
            data_page_size_limit: DEFAULT_PAGE_SIZE,
            dictionary_page_size_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT,
            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
            max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
            writer_version: DEFAULT_WRITER_VERSION,
            created_by: DEFAULT_CREATED_BY.to_string(),
            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
            key_value_metadata: None,
            default_column_properties: Default::default(),
            column_properties: HashMap::new(),
            sorting_columns: None,
            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
            coerce_types: DEFAULT_COERCE_TYPES,
        }
    }

    /// Finalizes the configuration and returns immutable writer properties struct.
    pub fn build(self) -> WriterProperties {
        WriterProperties {
            data_page_size_limit: self.data_page_size_limit,
            dictionary_page_size_limit: self.dictionary_page_size_limit,
            data_page_row_count_limit: self.data_page_row_count_limit,
            write_batch_size: self.write_batch_size,
            max_row_group_size: self.max_row_group_size,
            bloom_filter_position: self.bloom_filter_position,
            writer_version: self.writer_version,
            created_by: self.created_by,
            offset_index_disabled: self.offset_index_disabled,
            key_value_metadata: self.key_value_metadata,
            default_column_properties: self.default_column_properties,
            column_properties: self.column_properties,
            sorting_columns: self.sorting_columns,
            column_index_truncate_length: self.column_index_truncate_length,
            statistics_truncate_length: self.statistics_truncate_length,
            coerce_types: self.coerce_types,
        }
    }

    // ----------------------------------------------------------------------
    // Writer properties related to a file

    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`])
    ///
    /// This value can determine what features some readers will support.
    ///
    /// [`PARQUET_1_0`]: WriterVersion::PARQUET_1_0
    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
        self.writer_version = value;
        self
    }

    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`).
    ///
    /// The parquet writer will attempt to limit the sizes of each
    /// `DataPage` to this many bytes. Reducing this value will result
    /// in larger parquet files, but may improve the effectiveness of
    /// page index based predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
        self.data_page_size_limit = value;
        self
    }

    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`).
    ///
    /// The parquet writer will attempt to limit the number of rows in
    /// each `DataPage` to this value. Reducing this value will result
    /// in larger parquet files, but may improve the effectiveness of
    /// page index based predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
        self.data_page_row_count_limit = value;
        self
    }

    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`).
    ///
    /// The parquet writer will attempt to limit the size of each
    /// `DataPage` used to store dictionaries to this many
    /// bytes. Reducing this value will result in larger parquet
    /// files, but may improve the effectiveness of page index based
    /// predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
        self.dictionary_page_size_limit = value;
        self
    }

    /// Sets write batch size (defaults to 1024).
    ///
    /// For performance reasons, data for each column is written in
    /// batches of this size.
    ///
    /// Additional limits such as
    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
    /// are checked between batches, and thus the write batch size value acts as an
    /// upper-bound on the enforcement granularity of other limits.
    pub fn set_write_batch_size(mut self, value: usize) -> Self {
        self.write_batch_size = value;
        self
    }

    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`).
    ///
    /// # Panics
    /// If the value is set to 0.
    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
        assert!(value > 0, "Cannot have a 0 max row group size");
        self.max_row_group_size = value;
        self
    }

    /// Sets where in the final file Bloom Filters are written (default `AfterRowGroup`)
    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
        self.bloom_filter_position = value;
        self
    }

    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>`).
    pub fn set_created_by(mut self, value: String) -> Self {
        self.created_by = value;
        self
    }

    /// Sets whether the writing of offset indexes is disabled (defaults to `false`).
    ///
    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
    ///
    /// Note: As the offset indexes are useful for accessing data by row number,
    /// they are always written by default, regardless of whether other statistics
    /// are enabled. Disabling this metadata may result in a degradation in read
    /// performance, so use this option with care.
    ///
    /// [`Page`]: EnabledStatistics::Page
    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
        self.offset_index_disabled = value;
        self
    }

    /// Sets "key_value_metadata" property (defaults to `None`).
    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
        self.key_value_metadata = value;
        self
    }

    /// Sets sorting order of rows in the row group if any (defaults to `None`).
    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
        self.sorting_columns = value;
        self
    }

    // ----------------------------------------------------------------------
    // Setters for any column (global)

    /// Sets default encoding for all columns.
    ///
    /// If dictionary is not enabled, this is treated as a primary encoding for all
    /// columns. In case when dictionary is enabled for any column, this value is
    /// considered to be a fallback encoding for that column.
    ///
    /// # Panics
    ///
    /// if dictionary encoding is specified, regardless of dictionary
    /// encoding flag being set.
    pub fn set_encoding(mut self, value: Encoding) -> Self {
        self.default_column_properties.set_encoding(value);
        self
    }

    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`]).
    ///
    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.default_column_properties.set_compression(value);
        self
    }

    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`).
    ///
    /// Use this method to set dictionary encoding, instead of explicitly specifying
    /// encoding in `set_encoding` method.
    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
        self.default_column_properties.set_dictionary_enabled(value);
        self
    }

    /// Sets default statistics level for all columns (defaults to [`Page`]).
    ///
    /// [`Page`]: EnabledStatistics::Page
    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
        self.default_column_properties.set_statistics_enabled(value);
        self
    }

    /// Sets default max statistics size for all columns (defaults to `4096`).
    ///
    /// Applicable only if statistics are enabled.
    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
    pub fn set_max_statistics_size(mut self, value: usize) -> Self {
        #[allow(deprecated)]
        self.default_column_properties
            .set_max_statistics_size(value);
        self
    }

    /// Sets if bloom filter is enabled by default for all columns (defaults to `false`).
    ///
    /// # Notes
    ///
    /// * If the bloom filter is previously enabled then it is a no-op.
    ///
    /// * If the bloom filter is not enabled, the default values for ndv and
    ///   fpp are used. See [`set_bloom_filter_ndv`] and
    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
    ///
    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
        self.default_column_properties
            .set_bloom_filter_enabled(value);
        self
    }

    /// Sets the default target bloom filter false positive probability (fpp)
    /// for all columns (defaults to `0.05`).
    ///
    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
    /// been called.
    ///
    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
        self.default_column_properties.set_bloom_filter_fpp(value);
        self
    }

    /// Sets default number of distinct values (ndv) for bloom filter for all
    /// columns (defaults to `1_000_000`).
    ///
    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
    /// been called.
    ///
    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
        self.default_column_properties.set_bloom_filter_ndv(value);
        self
    }

    // ----------------------------------------------------------------------
    // Setters for a specific column

    /// Helper method to get existing or new mutable reference of column properties.
    #[inline]
    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
        self.column_properties.entry(col).or_default()
    }

    /// Sets encoding for a specific column.
    ///
    /// Takes precedence over [`Self::set_encoding`].
    ///
    /// If dictionary is not enabled, this is treated as a primary encoding for this
    /// column. In case when dictionary is enabled for this column, either through
    /// global defaults or explicitly, this value is considered to be a fallback
    /// encoding for this column.
    ///
    /// # Panics
    /// If user tries to set dictionary encoding here, regardless of dictionary
    /// encoding flag being set.
    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
        self.get_mut_props(col).set_encoding(value);
        self
    }

    /// Sets compression codec for a specific column.
    ///
    /// Takes precedence over [`Self::set_compression`].
    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
        self.get_mut_props(col).set_compression(value);
        self
    }

    /// Sets flag to enable/disable dictionary encoding for a specific column.
    ///
    /// Takes precedence over [`Self::set_dictionary_enabled`].
    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
        self.get_mut_props(col).set_dictionary_enabled(value);
        self
    }

    /// Sets statistics level for a specific column.
    ///
    /// Takes precedence over [`Self::set_statistics_enabled`].
    pub fn set_column_statistics_enabled(
        mut self,
        col: ColumnPath,
        value: EnabledStatistics,
    ) -> Self {
        self.get_mut_props(col).set_statistics_enabled(value);
        self
    }

    /// Sets max size for statistics for a specific column.
    ///
    /// Takes precedence over [`Self::set_max_statistics_size`].
    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
    pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self {
        #[allow(deprecated)]
        self.get_mut_props(col).set_max_statistics_size(value);
        self
    }

    /// Sets whether a bloom filter should be written for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
        self.get_mut_props(col).set_bloom_filter_enabled(value);
        self
    }

    /// Sets the false positive probability for bloom filter for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
        self.get_mut_props(col).set_bloom_filter_fpp(value);
        self
    }

    /// Sets the number of distinct values for bloom filter for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
        self.get_mut_props(col).set_bloom_filter_ndv(value);
        self
    }

    /// Sets the max length of min/max value fields when writing the column
    /// [`Index`] (defaults to `None`).
    ///
    /// This can be used to prevent columns with very long values (hundreds of
    /// bytes long) from causing the parquet metadata to become huge.
    ///
    /// # Notes
    ///
    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
    /// set to [`EnabledStatistics::Page`].
    ///
    /// * If `Some`, must be greater than 0, otherwise will panic
    /// * If `None`, there's no effective limit.
    ///
    /// [`Index`]: crate::file::page_index::index::Index
    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
        if let Some(value) = max_length {
            assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`.");
        }

        self.column_index_truncate_length = max_length;
        self
    }

    /// Sets the max length of min/max value fields in row group level
    /// [`Statistics`] (defaults to `None`).
    ///
    /// # Notes
    /// Row group level [`Statistics`] are written when [`Self::set_statistics_enabled`] is
    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`].
    ///
    /// * If `Some`, must be greater than 0, otherwise will panic
    /// * If `None`, there's no effective limit.
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
        if let Some(value) = max_length {
            assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
        }

        self.statistics_truncate_length = max_length;
        self
    }

    /// Should the writer coerce types to parquet native types (defaults to `false`).
    ///
    /// Leaving this option the default `false` will ensure the exact same data
    /// written to parquet using this library will be read.
    ///
    /// Setting this option to `true` will result in parquet files that can be
    /// read by more readers, but potentially lose information in the process.
    ///
    /// * Types such as [`DataType::Date64`], which have no direct corresponding
    ///   Parquet type, may be stored with lower precision.
    ///
    /// * The internal field names of `List` and `Map` types will be renamed if
    ///   necessary to match what is required by the newest Parquet specification.
    ///
    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
    ///
    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
        self.coerce_types = coerce_types;
        self
    }
}
812
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// One set of statistics is stored per relevant column per row group, so
    /// the amount of stored statistics grows with the number of row groups
    /// written.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// One set of statistics is stored per relevant column per page and per
    /// row group, so the amount of stored statistics grows with both the
    /// number of pages and the number of row groups written.
    Page,
}

impl FromStr for EnabledStatistics {
    type Err = String;

    /// Parses a statistics level from its all-uppercase or all-lowercase
    /// spelling; any other input (including mixed case) is rejected.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s == "NONE" || s == "none" {
            Ok(EnabledStatistics::None)
        } else if s == "CHUNK" || s == "chunk" {
            Ok(EnabledStatistics::Chunk)
        } else if s == "PAGE" || s == "page" {
            Ok(EnabledStatistics::Page)
        } else {
            Err(format!("Invalid statistics arg: {}", s))
        }
    }
}
852
impl Default for EnabledStatistics {
    /// Returns the crate-wide default statistics level,
    /// `DEFAULT_STATISTICS_ENABLED`.
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
858
/// Controls the bloom filter to be computed by the writer.
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability, should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp, the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// Setting this to a very small number diminishes the value of the filter itself, as the bitset size is
    /// even larger than just storing the whole value. You are also expected to set `ndv` if it can
    /// be known in advance in order to largely reduce space usage.
    pub fpp: f64,
    /// Number of distinct values, should be non-negative to be meaningful. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Usage of bloom filter is most beneficial for columns with large cardinality, so a good heuristic
    /// is to set ndv to the number of rows. However it can reduce disk size if you know in advance a smaller
    /// number of distinct values. For a very small ndv value it is probably not worth it to use a bloom filter
    /// anyway.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
    pub ndv: u64,
}
886
887impl Default for BloomFilterProperties {
888 fn default() -> Self {
889 BloomFilterProperties {
890 fpp: DEFAULT_BLOOM_FILTER_FPP,
891 ndv: DEFAULT_BLOOM_FILTER_NDV,
892 }
893 }
894}
895
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    /// Fallback (non-dictionary) encoding; dictionary encodings are rejected here.
    encoding: Option<Encoding>,
    /// Compression codec for the column's pages.
    codec: Option<Compression>,
    /// Whether dictionary encoding is enabled for this column.
    dictionary_enabled: Option<bool>,
    /// Statistics level requested for this column.
    statistics_enabled: Option<EnabledStatistics>,
    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
    max_statistics_size: Option<usize>,
    /// bloom filter related properties
    bloom_filter_properties: Option<BloomFilterProperties>,
}
911
912impl ColumnProperties {
913 /// Sets encoding for this column.
914 ///
915 /// If dictionary is not enabled, this is treated as a primary encoding for a column.
916 /// In case when dictionary is enabled for a column, this value is considered to
917 /// be a fallback encoding.
918 ///
919 /// Panics if user tries to set dictionary encoding here, regardless of dictionary
920 /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
921 /// for a column.
922 fn set_encoding(&mut self, value: Encoding) {
923 if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
924 panic!("Dictionary encoding can not be used as fallback encoding");
925 }
926 self.encoding = Some(value);
927 }
928
929 /// Sets compression codec for this column.
930 fn set_compression(&mut self, value: Compression) {
931 self.codec = Some(value);
932 }
933
934 /// Sets whether or not dictionary encoding is enabled for this column.
935 fn set_dictionary_enabled(&mut self, enabled: bool) {
936 self.dictionary_enabled = Some(enabled);
937 }
938
939 /// Sets the statistics level for this column.
940 fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
941 self.statistics_enabled = Some(enabled);
942 }
943
944 /// Sets max size for statistics for this column.
945 #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
946 #[allow(deprecated)]
947 fn set_max_statistics_size(&mut self, value: usize) {
948 self.max_statistics_size = Some(value);
949 }
950
951 /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
952 /// otherwise it is a no-op.
953 /// If `value` is `false`, resets bloom filter properties to `None`.
954 fn set_bloom_filter_enabled(&mut self, value: bool) {
955 if value && self.bloom_filter_properties.is_none() {
956 self.bloom_filter_properties = Some(Default::default())
957 } else if !value {
958 self.bloom_filter_properties = None
959 }
960 }
961
962 /// Sets the false positive probability for bloom filter for this column, and implicitly enables
963 /// bloom filter if not previously enabled.
964 ///
965 /// # Panics
966 ///
967 /// Panics if the `value` is not between 0 and 1 exclusive
968 fn set_bloom_filter_fpp(&mut self, value: f64) {
969 assert!(
970 value > 0. && value < 1.0,
971 "fpp must be between 0 and 1 exclusive, got {value}"
972 );
973
974 self.bloom_filter_properties
975 .get_or_insert_with(Default::default)
976 .fpp = value;
977 }
978
979 /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
980 /// enables bloom filter if not previously enabled.
981 fn set_bloom_filter_ndv(&mut self, value: u64) {
982 self.bloom_filter_properties
983 .get_or_insert_with(Default::default)
984 .ndv = value;
985 }
986
987 /// Returns optional encoding for this column.
988 fn encoding(&self) -> Option<Encoding> {
989 self.encoding
990 }
991
992 /// Returns optional compression codec for this column.
993 fn compression(&self) -> Option<Compression> {
994 self.codec
995 }
996
997 /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
998 /// disabled then returns `Some(false)`. If result is `None`, then no setting has
999 /// been provided.
1000 fn dictionary_enabled(&self) -> Option<bool> {
1001 self.dictionary_enabled
1002 }
1003
1004 /// Returns optional statistics level requested for this column. If result is `None`,
1005 /// then no setting has been provided.
1006 fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1007 self.statistics_enabled
1008 }
1009
1010 /// Returns optional max size in bytes for statistics.
1011 #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
1012 fn max_statistics_size(&self) -> Option<usize> {
1013 #[allow(deprecated)]
1014 self.max_statistics_size
1015 }
1016
1017 /// Returns the bloom filter properties, or `None` if not enabled
1018 fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1019 self.bloom_filter_properties.as_ref()
1020 }
1021}
1022
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

/// Default for `ReaderProperties::read_bloom_filter`: bloom filters are not read.
const DEFAULT_READ_BLOOM_FILTER: bool = false;
1027
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    /// Codec options used when decompressing pages (e.g. LZ4 fallback behavior).
    codec_options: CodecOptions,
    /// Whether bloom filters should be read from the file (defaults to `false`).
    read_bloom_filter: bool,
}
1050
impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns the codec options to use when decompressing pages.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns `true` if bloom filters should be read from the file.
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }
}
1067
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    /// Accumulates codec options; finalized by `build`.
    codec_options_builder: CodecOptionsBuilder,
    /// `None` means "not set"; `build` falls back to `DEFAULT_READ_BLOOM_FILTER`.
    read_bloom_filter: Option<bool>,
}
1074
1075/// Reader properties builder.
1076impl ReaderPropertiesBuilder {
1077 /// Returns default state of the builder.
1078 fn with_defaults() -> Self {
1079 Self {
1080 codec_options_builder: CodecOptionsBuilder::default(),
1081 read_bloom_filter: None,
1082 }
1083 }
1084
1085 /// Finalizes the configuration and returns immutable reader properties struct.
1086 pub fn build(self) -> ReaderProperties {
1087 ReaderProperties {
1088 codec_options: self.codec_options_builder.build(),
1089 read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1090 }
1091 }
1092
1093 /// Enable/disable backward compatible LZ4.
1094 ///
1095 /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1096 /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1097 /// with files generated by older versions of this library, and LZ4_RAW, for backward
1098 /// compatibility with files generated by older versions of parquet-cpp.
1099 ///
1100 /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1101 pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1102 self.codec_options_builder = self
1103 .codec_options_builder
1104 .set_backward_compatible_lz4(value);
1105 self
1106 }
1107
1108 /// Enable/disable reading bloom filter
1109 ///
1110 /// If reading bloom filter is enabled, bloom filter will be read from the file.
1111 /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1112 ///
1113 /// By default bloom filter is set to be read.
1114 pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1115 self.read_bloom_filter = Some(value);
1116 self
1117 }
1118}
1119
#[cfg(test)]
mod tests {
    // Unit tests for writer/reader property builders: defaults, overrides,
    // panic conditions, and `FromStr` parsing of enum-like settings.
    use super::*;

    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    // A default-built WriterProperties must report the crate-wide DEFAULT_*
    // constants from every getter, and no per-column overrides.
    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        assert!(props
            .bloom_filter_properties(&ColumnPath::from("col"))
            .is_none());
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    // The panic must fire for per-column encodings too, whether or not the
    // dictionary flag is on (next two tests).
    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    // Exercises every builder setter; column "col" has explicit overrides while
    // column "a" falls through to the global column settings.
    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_size(40)
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
        assert_eq!(props.data_page_size_limit(), 10);
        assert_eq!(props.dictionary_page_size_limit(), 20);
        assert_eq!(props.write_batch_size(), 30);
        assert_eq!(props.max_row_group_size(), 40);
        assert_eq!(props.created_by(), "default");
        assert_eq!(
            props.key_value_metadata(),
            Some(&vec![
                KeyValue::new("key".to_string(), "value".to_string(),)
            ])
        );

        assert_eq!(
            props.encoding(&ColumnPath::from("a")),
            Some(Encoding::DELTA_BINARY_PACKED)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("a")),
            Compression::GZIP(Default::default())
        );
        assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("a")),
            EnabledStatistics::None
        );

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::SNAPPY
        );
        assert!(props.dictionary_enabled(&ColumnPath::from("col")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            EnabledStatistics::Chunk
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
        );
    }

    // Options not set on a specific column should fall back to globals, and
    // bloom filter defaults (fpp 0.05, ndv 1_000_000) should apply.
    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 1_000_000_u64
            })
        );
    }

    // Setting only ndv or only fpp implicitly enables the bloom filter with
    // the other field at its default.
    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 100
            })
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: 1_000_000_u64
            })
        );
    }

    // Defaults: backward compatible LZ4 is on; bloom filter reading is off.
    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(!props.read_bloom_filter());
    }

    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
    }

    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);

        //test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }
}