parquet::column::writer::byte_budget_chunker

Struct ByteBudgetChunker

pub(crate) struct ByteBudgetChunker {
    page_byte_limit: usize,
    max_def_level: i16,
    static_always_fits: bool,
    dict_page_byte_limit: usize,
    static_dict_always_fits: bool,
}

Expand description

Picks byte-budget-aware mini-batch sizes for one column.

The parquet column writer checks the data page byte limit only after each mini-batch finishes writing. Mini-batches are sized in rows (write_batch_size, default 1024), so for BYTE_ARRAY columns whose values are large (e.g. multi-MiB blobs), a single mini-batch can buffer multiple GiB into one page before the limit is consulted.

This struct isolates the per-chunk decision that prevents that: given a chunk’s level data and the input values, pick the largest sub_batch_size such that one mini-batch fits within one page’s byte budget. For the overwhelmingly common case (small or fixed-width values) the answer is just chunk_size, and the decision is an O(1) check based on the column type — only when the input might overflow does the chunker consult the encoder’s byte estimate.

Fields§

§page_byte_limit: usize

Configured data page byte limit for the column.

§max_def_level: i16

Max definition level of the column; a level equal to this marks a present (non-null) leaf value. Used to count values per chunk.

§static_always_fits: bool

true when no chunk of base_batch_size values can ever overflow page_byte_limit regardless of input. Set once at column open from the physical type’s known per-value byte size; lets the per-chunk decision short-circuit with no work for every numeric, bool, or narrow FIXED_LEN_BYTE_ARRAY column.

§dict_page_byte_limit: usize

Configured dictionary page byte limit for the column.

§static_dict_always_fits: bool

As Self::static_always_fits but for the dictionary page: true when one base_batch_size mini-batch of this fixed-width type cannot overshoot dict_page_byte_limit by more than one mini-batch’s worth.

Struct ByteBudgetChunker Copy item path

Fields§

Implementations§

impl ByteBudgetChunker

pub(crate) fn new( descr: &ColumnDescriptor, props: &WriterProperties, base_batch_size: usize, ) -> Self

pub(crate) fn pick_sub_batch_size<E: ColumnValueEncoder>( &self, encoder: &E, values: &E::Values, value_indices: Option<&[usize]>, chunk_def: LevelDataRef<'_>, values_offset: usize, chunk_size: usize, ) -> usize

fn byte_budget_sub_batch_size<E: ColumnValueEncoder>( &self, values: &E::Values, value_indices: Option<&[usize]>, chunk_def: LevelDataRef<'_>, values_offset: usize, chunk_size: usize, budget: usize, ) -> usize

Auto Trait Implementations§

impl Freeze for ByteBudgetChunker

impl RefUnwindSafe for ByteBudgetChunker

impl Send for ByteBudgetChunker

impl Sync for ByteBudgetChunker

impl Unpin for ByteBudgetChunker

impl UnsafeUnpin for ByteBudgetChunker

impl UnwindSafe for ByteBudgetChunker

Blanket Implementations§

impl<T> Allocation for T where T: RefUnwindSafe + Send + Sync,

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<T> Ungil for T where T: Send,

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

Struct ByteBudgetChunker

impl<T> Allocation for T
where T: RefUnwindSafe + Send + Sync,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<T> Ungil for T
where T: Send,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,