Struct WriterPropertiesBuilder

Source

pub struct WriterPropertiesBuilder {
    data_page_size_limit: usize,
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_size: usize,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    offset_index_disabled: bool,
    key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
    file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
}

Expand description

Builder for WriterProperties Parquet writer configuration.

See example on WriterProperties

Fields§

§data_page_size_limit: usize
§data_page_row_count_limit: usize
§write_batch_size: usize
§max_row_group_size: usize
§bloom_filter_position: BloomFilterPosition
§writer_version: WriterVersion
§created_by: String
§offset_index_disabled: bool
§key_value_metadata: Option<Vec<KeyValue>>
§default_column_properties: ColumnProperties
§column_properties: HashMap<ColumnPath, ColumnProperties>
§sorting_columns: Option<Vec<SortingColumn>>
§column_index_truncate_length: Option<usize>
§statistics_truncate_length: Option<usize>
§coerce_types: bool
§file_encryption_properties: Option<Arc<FileEncryptionProperties>>

Implementations§

Source §

impl WriterPropertiesBuilder

Source

pub fn build(self) -> WriterProperties

Finalizes the configuration and returns immutable writer properties struct.

Source

pub fn set_writer_version(self, value: WriterVersion) -> Self

Sets the WriterVersion written into the parquet metadata (defaults to PARQUET_1_0 via DEFAULT_WRITER_VERSION).

This value can determine what features some readers will support.

Source

pub fn set_data_page_size_limit(self, value: usize) -> Self

Sets best effort maximum size of a data page in bytes (defaults to 1024 * 1024 via DEFAULT_PAGE_SIZE).

The parquet writer will attempt to limit the sizes of each DataPage to this many bytes. Reducing this value will result in larger parquet files, but may improve the effectiveness of page index based predicate pushdown during reading.

Note: this is a best effort limit based on the value of set_write_batch_size.

Source

pub fn set_data_page_row_count_limit(self, value: usize) -> Self

Sets best effort maximum number of rows in a data page (defaults to 20_000 via DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT).

The parquet writer will attempt to limit the number of rows in each DataPage to this value. Reducing this value will result in larger parquet files, but may improve the effectiveness of page index based predicate pushdown during reading.

Note: this is a best effort limit based on value of set_write_batch_size.

Source

pub fn set_write_batch_size(self, value: usize) -> Self

Sets write batch size (defaults to 1024 via DEFAULT_WRITE_BATCH_SIZE).

For performance reasons, data for each column is written in batches of this size.

Additional limits such as set_data_page_row_count_limit are checked between batches, and thus the write batch size value acts as an upper-bound on the enforcement granularity of other limits.

Source

pub fn set_max_row_group_size(self, value: usize) -> Self

Sets maximum number of rows in a row group (defaults to 1024 * 1024 via DEFAULT_MAX_ROW_GROUP_SIZE).

§Panics

If the value is set to 0.

Source

pub fn set_bloom_filter_position(self, value: BloomFilterPosition) -> Self

Sets where in the final file Bloom Filters are written (defaults to AfterRowGroup via DEFAULT_BLOOM_FILTER_POSITION).

Source

pub fn set_created_by(self, value: String) -> Self

Sets “created by” property (defaults to parquet-rs version <VERSION> via DEFAULT_CREATED_BY).

This is a string that will be written into the file metadata

Source

pub fn set_offset_index_disabled(self, value: bool) -> Self

Sets whether the writing of offset indexes is disabled (defaults to false via DEFAULT_OFFSET_INDEX_DISABLED).

If statistics level is set to Page this setting will be overridden with false.

Note: As the offset indexes are useful for accessing data by row number, they are always written by default, regardless of whether other statistics are enabled. Disabling this metadata may result in a degradation in read performance, so use this option with care.

Source

pub fn set_key_value_metadata(self, value: Option<Vec<KeyValue>>) -> Self

Sets “key_value_metadata” property (defaults to None).

Source

pub fn set_sorting_columns(self, value: Option<Vec<SortingColumn>>) -> Self

Sets sorting order of rows in the row group if any (defaults to None).

Source

pub fn set_column_index_truncate_length(self, max_length: Option<usize>) -> Self

Sets the max length of min/max value fields when writing the column Index (defaults to Some(64) via DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH).

This can be used to prevent columns with very long values (hundreds of bytes long) from causing the parquet metadata to become huge.

§Notes

The column Index is written when Self::set_statistics_enabled is set to EnabledStatistics::Page.

If Some, must be greater than 0, otherwise will panic
If None, there’s no effective limit.

Source

pub fn set_statistics_truncate_length(self, max_length: Option<usize>) -> Self

Sets the max length of min/max value fields in row group and data page header Statistics (defaults to Some(64) via DEFAULT_STATISTICS_TRUNCATE_LENGTH).

§Notes

Row group Statistics are written when Self::set_statistics_enabled is set to EnabledStatistics::Chunk or EnabledStatistics::Page. Data page header Statistics are written when Self::set_statistics_enabled is set to EnabledStatistics::Page.

If Some, must be greater than 0, otherwise will panic
If None, there’s no effective limit.

§See also

Truncation of Page Index statistics is controlled separately via WriterPropertiesBuilder::set_column_index_truncate_length

Source

pub fn set_coerce_types(self, coerce_types: bool) -> Self

Should the writer coerce types to parquet native types (defaults to false via DEFAULT_COERCE_TYPES).

Leaving this option at the default false ensures that exactly the same data written to parquet using this library will be read back.

Setting this option to true will result in parquet files that can be read by more readers, but potentially lose information in the process.

Types such as DataType::Date64, which have no direct corresponding Parquet type, may be stored with lower precision.
The internal field names of List and Map types will be renamed if necessary to match what is required by the newest Parquet specification.

See ArrowToParquetSchemaConverter::with_coerce_types for more details

Source

pub fn with_file_encryption_properties( self, file_encryption_properties: Arc<FileEncryptionProperties>, ) -> Self

Sets FileEncryptionProperties (defaults to None)

Source

pub fn set_encoding(self, value: Encoding) -> Self

Sets default encoding for all columns.

If dictionary is not enabled, this is treated as a primary encoding for all columns. In case when dictionary is enabled for any column, this value is considered to be a fallback encoding for that column.

§Panics

If dictionary encoding is specified, regardless of dictionary encoding flag being set.

Source

pub fn set_compression(self, value: Compression) -> Self

Sets default compression codec for all columns (defaults to UNCOMPRESSED via DEFAULT_COMPRESSION).

Source

pub fn set_dictionary_enabled(self, value: bool) -> Self

Sets default flag to enable/disable dictionary encoding for all columns (defaults to true via DEFAULT_DICTIONARY_ENABLED).

Use this method to set dictionary encoding, instead of explicitly specifying encoding in set_encoding method.

Source

pub fn set_dictionary_page_size_limit(self, value: usize) -> Self

Sets best effort maximum dictionary page size, in bytes (defaults to 1024 * 1024 via DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT).

The parquet writer will attempt to limit the size of each DataPage used to store dictionaries to this many bytes. Reducing this value will result in larger parquet files, but may improve the effectiveness of page index based predicate pushdown during reading.

Note: this is a best effort limit based on value of set_write_batch_size.

Source

pub fn set_statistics_enabled(self, value: EnabledStatistics) -> Self

Sets default EnabledStatistics level for all columns (defaults to Page via DEFAULT_STATISTICS_ENABLED).

Source

pub fn set_write_page_header_statistics(self, value: bool) -> Self

Enables/disables writing Statistics in the page header (defaults to false via DEFAULT_WRITE_PAGE_HEADER_STATISTICS).

Only applicable if Page level statistics are gathered.

Setting this value to true can greatly increase the size of the resulting Parquet file while yielding very little added benefit. Most modern Parquet implementations will use the min/max values stored in the ParquetColumnIndex rather than those in the page header.

§Note

Prior to version 56.0.0, the parquet crate always wrote these statistics (the equivalent of setting this option to true). This was changed in 56.0.0 to follow the recommendation in the Parquet specification. See issue #7580 for more details.

Source

pub fn set_bloom_filter_enabled(self, value: bool) -> Self

Sets if bloom filter should be written for all columns (defaults to false).

§Notes

If the bloom filter is enabled previously then it is a no-op.
If the bloom filter is not enabled, default values for ndv and fpp value are used. See set_bloom_filter_ndv and set_bloom_filter_fpp to further adjust the ndv and fpp.

Source

pub fn set_bloom_filter_fpp(self, value: f64) -> Self

Sets the default target bloom filter false positive probability (fpp) for all columns (defaults to 0.05 via DEFAULT_BLOOM_FILTER_FPP).

Implicitly enables bloom writing, as if set_bloom_filter_enabled had been called.

Source

pub fn set_bloom_filter_ndv(self, value: u64) -> Self

Sets default number of distinct values (ndv) for bloom filter for all columns (defaults to 1_000_000 via DEFAULT_BLOOM_FILTER_NDV).

Implicitly enables bloom writing, as if set_bloom_filter_enabled had been called.

Source

fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties

Helper method to get existing or new mutable reference of column properties.

Source

pub fn set_column_encoding(self, col: ColumnPath, value: Encoding) -> Self

Sets encoding for a specific column.

Takes precedence over Self::set_encoding.

If dictionary is not enabled, this is treated as a primary encoding for this column. In case when dictionary is enabled for this column, either through global defaults or explicitly, this value is considered to be a fallback encoding for this column.

§Panics

If user tries to set dictionary encoding here, regardless of dictionary encoding flag being set.

Source

pub fn set_column_compression(self, col: ColumnPath, value: Compression) -> Self

Sets compression codec for a specific column.

Takes precedence over Self::set_compression.

Source

pub fn set_column_dictionary_enabled(self, col: ColumnPath, value: bool) -> Self

Sets flag to enable/disable dictionary encoding for a specific column.

Takes precedence over Self::set_dictionary_enabled.

Source

pub fn set_column_dictionary_page_size_limit( self, col: ColumnPath, value: usize, ) -> Self

Sets dictionary page size limit for a specific column.

Takes precedence over Self::set_dictionary_page_size_limit.

Source

pub fn set_column_statistics_enabled( self, col: ColumnPath, value: EnabledStatistics, ) -> Self

Sets EnabledStatistics level for a specific column.

Takes precedence over Self::set_statistics_enabled.

Source

pub fn set_column_write_page_header_statistics( self, col: ColumnPath, value: bool, ) -> Self

Sets whether to write Statistics in the page header for a specific column.

Takes precedence over Self::set_write_page_header_statistics.

Source

pub fn set_column_bloom_filter_enabled( self, col: ColumnPath, value: bool, ) -> Self

Sets whether a bloom filter should be written for a specific column.

Takes precedence over Self::set_bloom_filter_enabled.

Source

pub fn set_column_bloom_filter_fpp(self, col: ColumnPath, value: f64) -> Self

Sets the false positive probability for bloom filter for a specific column.

Takes precedence over Self::set_bloom_filter_fpp.

Source

pub fn set_column_bloom_filter_ndv(self, col: ColumnPath, value: u64) -> Self

Sets the number of distinct values for bloom filter for a specific column.

Takes precedence over Self::set_bloom_filter_ndv.

Trait Implementations§

Source §

impl Clone for WriterPropertiesBuilder

Source §

fn clone(&self) -> WriterPropertiesBuilder

Returns a duplicate of the value. Read more

1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

Source §

impl Debug for WriterPropertiesBuilder

Source §

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Source §

impl Default for WriterPropertiesBuilder

Source §

fn default() -> Self

Returns default state of the builder.

Source §

impl From<WriterProperties> for WriterPropertiesBuilder

Source §

fn from(props: WriterProperties) -> Self

Converts to this type from the input type.

Auto Trait Implementations§

§

impl UnwindSafe for WriterPropertiesBuilder

Blanket Implementations§

Source §

impl<T> Any for T
where T: 'static + ?Sized,

Source §

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

Source §

impl<T> Borrow<T> for T
where T: ?Sized,

Source §

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

Source §

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source §

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

Source §

impl<T> CloneToUninit for T
where T: Clone,

Source §

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)

Performs copy-assignment from self to dest. Read more

Source §

impl<T> From<T> for T

Source §

fn from(t: T) -> T

Returns the argument unchanged.

Source §

impl<T, U> Into<U> for T
where U: From<T>,

Source §

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source §

impl<T> IntoEither for T

Source §

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

impl<T> ToOwned for T
where T: Clone,

Source §

type Owned = T

The resulting type after obtaining ownership.

Source §

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more

Source §

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more

Source §

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source §

type Error = Infallible

The type returned in the event of a conversion error.

Source §

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

Source §

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source §

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

Source §

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.

§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

§

fn vzip(self) -> V

§

impl<T> Allocation for T
where T: RefUnwindSafe + Send + Sync,

§

Struct WriterPropertiesBuilder Copy item path

Fields§

Implementations§

impl WriterPropertiesBuilder

pub fn build(self) -> WriterProperties

pub fn set_writer_version(self, value: WriterVersion) -> Self

pub fn set_data_page_size_limit(self, value: usize) -> Self

pub fn set_data_page_row_count_limit(self, value: usize) -> Self

pub fn set_write_batch_size(self, value: usize) -> Self

pub fn set_max_row_group_size(self, value: usize) -> Self

§Panics

pub fn set_bloom_filter_position(self, value: BloomFilterPosition) -> Self

pub fn set_created_by(self, value: String) -> Self

pub fn set_offset_index_disabled(self, value: bool) -> Self

pub fn set_key_value_metadata(self, value: Option<Vec<KeyValue>>) -> Self

pub fn set_sorting_columns(self, value: Option<Vec<SortingColumn>>) -> Self

pub fn set_column_index_truncate_length(self, max_length: Option<usize>) -> Self

§Notes

pub fn set_statistics_truncate_length(self, max_length: Option<usize>) -> Self

§Notes

§See also

pub fn set_coerce_types(self, coerce_types: bool) -> Self

pub fn with_file_encryption_properties( self, file_encryption_properties: Arc<FileEncryptionProperties>, ) -> Self

pub fn set_encoding(self, value: Encoding) -> Self

§Panics

pub fn set_compression(self, value: Compression) -> Self

pub fn set_dictionary_enabled(self, value: bool) -> Self

pub fn set_dictionary_page_size_limit(self, value: usize) -> Self

pub fn set_statistics_enabled(self, value: EnabledStatistics) -> Self

pub fn set_write_page_header_statistics(self, value: bool) -> Self

§Note

pub fn set_bloom_filter_enabled(self, value: bool) -> Self

§Notes

pub fn set_bloom_filter_fpp(self, value: f64) -> Self

pub fn set_bloom_filter_ndv(self, value: u64) -> Self

fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties

pub fn set_column_encoding(self, col: ColumnPath, value: Encoding) -> Self

§Panics

pub fn set_column_compression(self, col: ColumnPath, value: Compression) -> Self

pub fn set_column_dictionary_enabled(self, col: ColumnPath, value: bool) -> Self

pub fn set_column_dictionary_page_size_limit( self, col: ColumnPath, value: usize, ) -> Self

pub fn set_column_statistics_enabled( self, col: ColumnPath, value: EnabledStatistics, ) -> Self

pub fn set_column_write_page_header_statistics( self, col: ColumnPath, value: bool, ) -> Self

pub fn set_column_bloom_filter_enabled( self, col: ColumnPath, value: bool, ) -> Self

pub fn set_column_bloom_filter_fpp(self, col: ColumnPath, value: f64) -> Self

pub fn set_column_bloom_filter_ndv(self, col: ColumnPath, value: u64) -> Self

Trait Implementations§

impl Clone for WriterPropertiesBuilder

fn clone(&self) -> WriterPropertiesBuilder

fn clone_from(&mut self, source: &Self)

impl Debug for WriterPropertiesBuilder

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl Default for WriterPropertiesBuilder

fn default() -> Self

impl From<WriterProperties> for WriterPropertiesBuilder

fn from(props: WriterProperties) -> Self

Auto Trait Implementations§

impl Freeze for WriterPropertiesBuilder

impl RefUnwindSafe for WriterPropertiesBuilder

impl Send for WriterPropertiesBuilder

impl Sync for WriterPropertiesBuilder

impl Unpin for WriterPropertiesBuilder

impl UnwindSafe for WriterPropertiesBuilder

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> ToOwned for T where T: Clone,

Struct WriterPropertiesBuilder

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

impl<T> Allocation for T
where T: RefUnwindSafe + Send + Sync,

impl<T> Ungil for T
where T: Send,