parquet/file/metadata/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet metadata API
19//!
20//! Most users should use these structures to interact with Parquet metadata.
21//! The [crate::format] module contains lower level structures generated from the
22//! Parquet thrift definition.
23//!
24//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
25//!   file footer.
26//!
27//! * [`FileMetaData`]: File level metadata such as schema, row counts and
28//!   version.
29//!
30//! * [`RowGroupMetaData`]: Metadata for each Row Group with a File, such as
31//!   location and number of rows, and column chunks.
32//!
33//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
34//!   within a Row Group including encoding and compression information,
35//!   number of values, statistics, etc.
36//!
37//! # APIs for working with Parquet Metadata
38//!
39//! The Parquet readers and writers in this crate handle reading and writing
40//! metadata into parquet files. To work with metadata directly,
41//! the following APIs are available:
42//!
43//! * [`ParquetMetaDataReader`] for reading
44//! * [`ParquetMetaDataWriter`] for writing.
45//!
46//! [`ParquetMetaDataReader`]: https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html
47//! [`ParquetMetaDataWriter`]: https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataWriter.html
48//!
49//! # Examples
50//!
51//! Please see [`external_metadata.rs`]
52//!
53//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
54//!
55//! # Metadata Encodings and Structures
56//!
57//! There are three different encodings of Parquet Metadata in this crate:
58//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
60//!    [parquet.thrift]
61//!
62//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
63//!    from [parquet.thrift]. These structures are low level and mirror
64//!    the thrift definitions.
65//!
66//! 3. [`file::metadata`] (this module): Easier to use Rust structures
67//!    with a more idiomatic API. Note that, confusingly, some but not all
68//!    of these structures have the same name as the [`format`] structures.
69//!
70//! [`format`]: crate::format
71//! [`file::metadata`]: crate::file::metadata
72//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
73//!
74//! Graphically, this is how the different structures relate to each other:
75//!
76//! ```text
77//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
78//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
79//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
80//!                            └──────────────┘     │         └───────────────────────┘ │
81//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
82//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
83//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
84//!                                     ...         │                   ...             │
85//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
86//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
87//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
88//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
89//!
90//!                          format::meta structures          file::metadata structures
91//!
92//!                         * Same name, different struct
93//! ```
94mod memory;
95pub(crate) mod reader;
96mod writer;
97
98use std::ops::Range;
99use std::sync::Arc;
100
101use crate::format::{
102    BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup,
103    SizeStatistics, SortingColumn,
104};
105
106use crate::basic::{ColumnOrder, Compression, Encoding, Type};
107use crate::errors::{ParquetError, Result};
108pub(crate) use crate::file::metadata::memory::HeapSize;
109use crate::file::page_encoding_stats::{self, PageEncodingStats};
110use crate::file::page_index::index::Index;
111use crate::file::page_index::offset_index::OffsetIndexMetaData;
112use crate::file::statistics::{self, Statistics};
113use crate::schema::types::{
114    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
115    Type as SchemaType,
116};
117pub use reader::ParquetMetaDataReader;
118pub use writer::ParquetMetaDataWriter;
119pub(crate) use writer::ThriftMetadataWriter;
120
/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`Index`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`Index`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example `column_index[2][3]` holds the [`Index`] for the fourth
/// column in the third row group of the parquet file.
///
/// This index is optional; see [`ParquetMetaData::column_index`] for how it
/// is loaded and accessed.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetColumnIndex = Vec<Vec<Index>>;
137
/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number`.
///
/// This index is optional; see [`ParquetMetaData::offset_index`] for how it
/// is loaded and accessed.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
149
/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata
    file_metadata: FileMetaData,
    /// Metadata for each row group
    row_groups: Vec<RowGroupMetaData>,
    /// Column index (page level statistics) for each column chunk of each
    /// row group, if loaded (`None` otherwise)
    column_index: Option<ParquetColumnIndex>,
    /// Offset index (page locations) for each column chunk of each row
    /// group, if loaded (`None` otherwise)
    offset_index: Option<ParquetOffsetIndex>,
}
178
179impl ParquetMetaData {
180    /// Creates Parquet metadata from file metadata and a list of row
181    /// group metadata
182    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
183        ParquetMetaData {
184            file_metadata,
185            row_groups,
186            column_index: None,
187            offset_index: None,
188        }
189    }
190
191    /// Creates Parquet metadata from file metadata, a list of row
192    /// group metadata, and the column index structures.
193    #[deprecated(since = "53.1.0", note = "Use ParquetMetaDataBuilder")]
194    pub fn new_with_page_index(
195        file_metadata: FileMetaData,
196        row_groups: Vec<RowGroupMetaData>,
197        column_index: Option<ParquetColumnIndex>,
198        offset_index: Option<ParquetOffsetIndex>,
199    ) -> Self {
200        ParquetMetaDataBuilder::new(file_metadata)
201            .set_row_groups(row_groups)
202            .set_column_index(column_index)
203            .set_offset_index(offset_index)
204            .build()
205    }
206
207    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
208    pub fn into_builder(self) -> ParquetMetaDataBuilder {
209        self.into()
210    }
211
212    /// Returns file metadata as reference.
213    pub fn file_metadata(&self) -> &FileMetaData {
214        &self.file_metadata
215    }
216
217    /// Returns number of row groups in this file.
218    pub fn num_row_groups(&self) -> usize {
219        self.row_groups.len()
220    }
221
222    /// Returns row group metadata for `i`th position.
223    /// Position should be less than number of row groups `num_row_groups`.
224    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
225        &self.row_groups[i]
226    }
227
228    /// Returns slice of row groups in this file.
229    pub fn row_groups(&self) -> &[RowGroupMetaData] {
230        &self.row_groups
231    }
232
233    /// Returns the column index for this file if loaded
234    ///
235    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
236    /// [ArrowReaderOptions::with_page_index] was set to false.
237    ///
238    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
239    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
240        self.column_index.as_ref()
241    }
242
243    /// Returns offset indexes in this file, if loaded
244    ///
245    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
246    /// [ArrowReaderOptions::with_page_index] was set to false.
247    ///
248    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
249    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
250        self.offset_index.as_ref()
251    }
252
253    /// Estimate of the bytes allocated to store `ParquetMetadata`
254    ///
255    /// # Notes:
256    ///
257    /// 1. Includes size of self
258    ///
259    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
260    ///    [`RowGroupMetaData`].
261    ///
262    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
263    ///    means `memory_size` will over estimate the memory size if such pointers
264    ///    are shared.
265    ///
266    /// 4. Does not include any allocator overheads
267    pub fn memory_size(&self) -> usize {
268        std::mem::size_of::<Self>()
269            + self.file_metadata.heap_size()
270            + self.row_groups.heap_size()
271            + self.column_index.heap_size()
272            + self.offset_index.heap_size()
273    }
274
275    /// Override the column index
276    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
277        self.column_index = index;
278    }
279
280    /// Override the offset index
281    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
282        self.offset_index = index;
283    }
284}
285
/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// Create a builder with [`ParquetMetaDataBuilder::new`], or from an existing
/// [`ParquetMetaData`] via [`ParquetMetaData::into_builder`].
///
/// # Example creating a new [`ParquetMetaData`]
///
///```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
pub struct ParquetMetaDataBuilder(ParquetMetaData);
324
325impl ParquetMetaDataBuilder {
326    /// Create a new builder from a file metadata, with no row groups
327    pub fn new(file_meta_data: FileMetaData) -> Self {
328        Self(ParquetMetaData::new(file_meta_data, vec![]))
329    }
330
331    /// Create a new builder from an existing ParquetMetaData
332    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
333        Self(metadata)
334    }
335
336    /// Adds a row group to the metadata
337    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
338        self.0.row_groups.push(row_group);
339        self
340    }
341
342    /// Sets all the row groups to the specified list
343    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
344        self.0.row_groups = row_groups;
345        self
346    }
347
348    /// Takes ownership of the row groups in this builder, and clears the list
349    /// of row groups.
350    ///
351    /// This can be used for more efficient creation of a new ParquetMetaData
352    /// from an existing one.
353    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
354        std::mem::take(&mut self.0.row_groups)
355    }
356
357    /// Return a reference to the current row groups
358    pub fn row_groups(&self) -> &[RowGroupMetaData] {
359        &self.0.row_groups
360    }
361
362    /// Sets the column index
363    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
364        self.0.column_index = column_index;
365        self
366    }
367
368    /// Returns the current column index from the builder, replacing it with `None`
369    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
370        std::mem::take(&mut self.0.column_index)
371    }
372
373    /// Return a reference to the current column index, if any
374    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
375        self.0.column_index.as_ref()
376    }
377
378    /// Sets the offset index
379    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
380        self.0.offset_index = offset_index;
381        self
382    }
383
384    /// Returns the current offset index from the builder, replacing it with `None`
385    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
386        std::mem::take(&mut self.0.offset_index)
387    }
388
389    /// Return a reference to the current offset index, if any
390    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
391        self.0.offset_index.as_ref()
392    }
393
394    /// Creates a new ParquetMetaData from the builder
395    pub fn build(self) -> ParquetMetaData {
396        let Self(metadata) = self;
397        metadata
398    }
399}
400
impl From<ParquetMetaData> for ParquetMetaDataBuilder {
    /// Wraps the metadata in a builder (used by [`ParquetMetaData::into_builder`]).
    fn from(meta_data: ParquetMetaData) -> Self {
        Self(meta_data)
    }
}
406
/// A key-value pair for [`FileMetaData`].
///
/// An alias for the Thrift-generated [`crate::format::KeyValue`] struct.
pub type KeyValue = crate::format::KeyValue;

/// Reference counted pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
412
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Version of this file (see [`Self::version`])
    version: i32,
    /// Total number of rows in the file (see [`Self::num_rows`])
    num_rows: i64,
    /// Application that wrote this file, if set (see [`Self::created_by`])
    created_by: Option<String>,
    /// Optional application specific key/value metadata
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Descriptor of this file's schema
    schema_descr: SchemaDescPtr,
    /// Sort order for `min`/`max` values per column, if available
    /// (see [`Self::column_orders`])
    column_orders: Option<Vec<ColumnOrder>>,
}
425
426impl FileMetaData {
427    /// Creates new file metadata.
428    pub fn new(
429        version: i32,
430        num_rows: i64,
431        created_by: Option<String>,
432        key_value_metadata: Option<Vec<KeyValue>>,
433        schema_descr: SchemaDescPtr,
434        column_orders: Option<Vec<ColumnOrder>>,
435    ) -> Self {
436        FileMetaData {
437            version,
438            num_rows,
439            created_by,
440            key_value_metadata,
441            schema_descr,
442            column_orders,
443        }
444    }
445
446    /// Returns version of this file.
447    pub fn version(&self) -> i32 {
448        self.version
449    }
450
451    /// Returns number of rows in the file.
452    pub fn num_rows(&self) -> i64 {
453        self.num_rows
454    }
455
456    /// String message for application that wrote this file.
457    ///
458    /// This should have the following format:
459    /// `<application> version <application version> (build <application build hash>)`.
460    ///
461    /// ```shell
462    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
463    /// ```
464    pub fn created_by(&self) -> Option<&str> {
465        self.created_by.as_deref()
466    }
467
468    /// Returns key_value_metadata of this file.
469    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
470        self.key_value_metadata.as_ref()
471    }
472
473    /// Returns Parquet [`Type`] that describes schema in this file.
474    ///
475    /// [`Type`]: crate::schema::types::Type
476    pub fn schema(&self) -> &SchemaType {
477        self.schema_descr.root_schema()
478    }
479
480    /// Returns a reference to schema descriptor.
481    pub fn schema_descr(&self) -> &SchemaDescriptor {
482        &self.schema_descr
483    }
484
485    /// Returns reference counted clone for schema descriptor.
486    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
487        self.schema_descr.clone()
488    }
489
490    /// Column (sort) order used for `min` and `max` values of each column in this file.
491    ///
492    /// Each column order corresponds to one column, determined by its position in the
493    /// list, matching the position of the column in the schema.
494    ///
495    /// When `None` is returned, there are no column orders available, and each column
496    /// should be assumed to have undefined (legacy) column order.
497    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
498        self.column_orders.as_ref()
499    }
500
501    /// Returns column order for `i`th column in this file.
502    /// If column orders are not available, returns undefined (legacy) column order.
503    pub fn column_order(&self, i: usize) -> ColumnOrder {
504        self.column_orders
505            .as_ref()
506            .map(|data| data[i])
507            .unwrap_or(ColumnOrder::UNDEFINED)
508    }
509}
510
/// Reference counted pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows,
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk in this row group
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group
    num_rows: i64,
    /// Sort ordering of the rows, if any (see [`Self::sorting_columns`])
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total uncompressed byte size of all column data in this row group
    total_byte_size: i64,
    /// Schema descriptor for this row group
    schema_descr: SchemaDescPtr,
    /// We can't infer from the file offset of the first column since there
    /// may be empty columns in the row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
}
530
531impl RowGroupMetaData {
532    /// Returns builder for row group metadata.
533    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
534        RowGroupMetaDataBuilder::new(schema_descr)
535    }
536
537    /// Number of columns in this row group.
538    pub fn num_columns(&self) -> usize {
539        self.columns.len()
540    }
541
542    /// Returns column chunk metadata for `i`th column.
543    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
544        &self.columns[i]
545    }
546
547    /// Returns slice of column chunk metadata.
548    pub fn columns(&self) -> &[ColumnChunkMetaData] {
549        &self.columns
550    }
551
552    /// Returns mutable slice of column chunk metadata.
553    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
554        &mut self.columns
555    }
556
557    /// Number of rows in this row group.
558    pub fn num_rows(&self) -> i64 {
559        self.num_rows
560    }
561
562    /// Returns the sort ordering of the rows in this RowGroup if any
563    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
564        self.sorting_columns.as_ref()
565    }
566
567    /// Total byte size of all uncompressed column data in this row group.
568    pub fn total_byte_size(&self) -> i64 {
569        self.total_byte_size
570    }
571
572    /// Total size of all compressed column data in this row group.
573    pub fn compressed_size(&self) -> i64 {
574        self.columns.iter().map(|c| c.total_compressed_size).sum()
575    }
576
577    /// Returns reference to a schema descriptor.
578    pub fn schema_descr(&self) -> &SchemaDescriptor {
579        self.schema_descr.as_ref()
580    }
581
582    /// Returns reference counted clone of schema descriptor.
583    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
584        self.schema_descr.clone()
585    }
586
587    /// Returns ordinal position of this row group in file.
588    ///
589    /// For example if this is the first row group in the file, this will return 0.
590    /// If this is the second row group in the file, this will return 1.
591    #[inline(always)]
592    pub fn ordinal(&self) -> Option<i16> {
593        self.ordinal
594    }
595
596    /// Returns file offset of this row group in file.
597    #[inline(always)]
598    pub fn file_offset(&self) -> Option<i64> {
599        self.file_offset
600    }
601
602    /// Method to convert from Thrift.
603    pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
604        if schema_descr.num_columns() != rg.columns.len() {
605            return Err(general_err!(
606                "Column count mismatch. Schema has {} columns while Row Group has {}",
607                schema_descr.num_columns(),
608                rg.columns.len()
609            ));
610        }
611        let total_byte_size = rg.total_byte_size;
612        let num_rows = rg.num_rows;
613        let mut columns = vec![];
614        for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) {
615            let cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?;
616            columns.push(cc);
617        }
618        let sorting_columns = rg.sorting_columns;
619        Ok(RowGroupMetaData {
620            columns,
621            num_rows,
622            sorting_columns,
623            total_byte_size,
624            schema_descr,
625            file_offset: rg.file_offset,
626            ordinal: rg.ordinal,
627        })
628    }
629
630    /// Method to convert to Thrift.
631    pub fn to_thrift(&self) -> RowGroup {
632        RowGroup {
633            columns: self.columns().iter().map(|v| v.to_thrift()).collect(),
634            total_byte_size: self.total_byte_size,
635            num_rows: self.num_rows,
636            sorting_columns: self.sorting_columns().cloned(),
637            file_offset: self.file_offset(),
638            total_compressed_size: Some(self.compressed_size()),
639            ordinal: self.ordinal,
640        }
641    }
642
643    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
644    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
645        RowGroupMetaDataBuilder(self)
646    }
647}
648
/// Builder for row group metadata.
///
/// Create via [`RowGroupMetaData::builder`]; finish with [`Self::build`].
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
651
652impl RowGroupMetaDataBuilder {
653    /// Creates new builder from schema descriptor.
654    fn new(schema_descr: SchemaDescPtr) -> Self {
655        Self(RowGroupMetaData {
656            columns: Vec::with_capacity(schema_descr.num_columns()),
657            schema_descr,
658            file_offset: None,
659            num_rows: 0,
660            sorting_columns: None,
661            total_byte_size: 0,
662            ordinal: None,
663        })
664    }
665
666    /// Sets number of rows in this row group.
667    pub fn set_num_rows(mut self, value: i64) -> Self {
668        self.0.num_rows = value;
669        self
670    }
671
672    /// Sets the sorting order for columns
673    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
674        self.0.sorting_columns = value;
675        self
676    }
677
678    /// Sets total size in bytes for this row group.
679    pub fn set_total_byte_size(mut self, value: i64) -> Self {
680        self.0.total_byte_size = value;
681        self
682    }
683
684    /// Takes ownership of the the column metadata in this builder, and clears
685    /// the list of columns.
686    ///
687    /// This can be used for more efficient creation of a new RowGroupMetaData
688    /// from an existing one.
689    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
690        std::mem::take(&mut self.0.columns)
691    }
692
693    /// Sets column metadata for this row group.
694    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
695        self.0.columns = value;
696        self
697    }
698
699    /// Adds a column metadata to this row group
700    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
701        self.0.columns.push(value);
702        self
703    }
704
705    /// Sets ordinal for this row group.
706    pub fn set_ordinal(mut self, value: i16) -> Self {
707        self.0.ordinal = Some(value);
708        self
709    }
710
711    /// Sets file offset for this row group.
712    pub fn set_file_offset(mut self, value: i64) -> Self {
713        self.0.file_offset = Some(value);
714        self
715    }
716
717    /// Builds row group metadata.
718    pub fn build(self) -> Result<RowGroupMetaData> {
719        if self.0.schema_descr.num_columns() != self.0.columns.len() {
720            return Err(general_err!(
721                "Column length mismatch: {} != {}",
722                self.0.schema_descr.num_columns(),
723                self.0.columns.len()
724            ));
725        }
726
727        Ok(self.0)
728    }
729}
730
/// Metadata for a column chunk.
///
/// Fields mirror the Thrift `ColumnChunk` / `ColumnMetaData` structures
/// (see [`Self::from_thrift`] / [`Self::to_thrift`] elsewhere in this module).
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor (physical type, path, etc.) of this leaf column
    column_descr: ColumnDescPtr,
    /// Encodings used for this column chunk
    encodings: Vec<Encoding>,
    /// File where the column chunk is stored; `None` means the same file as
    /// the metadata (see [`Self::file_path`])
    file_path: Option<String>,
    /// Byte offset of `ColumnMetaData` in `file_path`; deprecated in the
    /// Parquet spec (see [`Self::file_offset`])
    file_offset: i64,
    /// Number of values in this column chunk
    num_values: i64,
    /// Compression codec used
    compression: Compression,
    /// Compressed size of this column chunk, in bytes
    total_compressed_size: i64,
    /// Uncompressed size of this column chunk, in bytes
    total_uncompressed_size: i64,
    /// Byte offset of the first data page
    data_page_offset: i64,
    /// Byte offset of the index page, if any
    index_page_offset: Option<i64>,
    /// Byte offset of the dictionary page, if any
    dictionary_page_offset: Option<i64>,
    /// Statistics for this column chunk, if written
    statistics: Option<Statistics>,
    /// Per-encoding page statistics, if written
    encoding_stats: Option<Vec<PageEncodingStats>>,
    /// Byte offset of the Bloom filter, if any
    bloom_filter_offset: Option<i64>,
    /// Length in bytes of the Bloom filter, if known
    bloom_filter_length: Option<i32>,
    /// Byte offset / length of the serialized `OffsetIndex`, if any
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    /// Byte offset / length of the serialized `ColumnIndex`, if any
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    /// Unencoded/uncompressed size of variable-length data, if written
    /// (from the Thrift `SizeStatistics`)
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Histogram of repetition levels, if written
    repetition_level_histogram: Option<LevelHistogram>,
    /// Histogram of definition levels, if written
    definition_level_histogram: Option<LevelHistogram>,
}
757
/// Histograms for repetition and definition levels.
///
/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of
/// values at level `i`.
///
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the
/// number of rows with level 1, and so on.
///
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a new level histogram data.
    ///
    /// Length will be `max_level + 1`.
    ///
    /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case)
    pub fn try_new(max_level: i16) -> Option<Self> {
        if max_level > 0 {
            // one zeroed bucket per level, including level 0
            Some(Self {
                inner: vec![0; max_level as usize + 1],
            })
        } else {
            None
        }
    }

    /// Returns a reference to the histogram's values.
    pub fn values(&self) -> &[i64] {
        &self.inner
    }

    /// Return the inner vector, consuming self
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the histogram value at the given index.
    ///
    /// The value of `i` is the number of values with level `i`. For example,
    /// `get(1)` returns the number of values with level 1.
    ///
    /// Returns `None` if the index is out of bounds.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Adds the values from the other histogram to this histogram
    ///
    /// # Panics
    /// If the histograms have different lengths
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
            *dst += src;
        }
    }

    /// Returns the length of the histogram (`max_level + 1`).
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Returns true if the histogram has no levels.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Sets the values of all histogram levels to 0.
    pub fn reset(&mut self) {
        // `fill` is the idiomatic way to zero a slice and optimizes to a memset
        self.inner.fill(0);
    }

    /// Updates histogram values using provided repetition levels
    ///
    /// # Panics
    /// if any of the levels is greater than the length of the histogram (
    /// the argument supplied to [`Self::try_new`])
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &level in levels {
            self.inner[level as usize] += 1;
        }
    }
}
845
846impl From<Vec<i64>> for LevelHistogram {
847    fn from(inner: Vec<i64>) -> Self {
848        Self { inner }
849    }
850}
851
852impl From<LevelHistogram> for Vec<i64> {
853    fn from(value: LevelHistogram) -> Self {
854        value.into_inner()
855    }
856}
857
impl HeapSize for LevelHistogram {
    /// Heap usage is exactly that of the inner `Vec<i64>`.
    fn heap_size(&self) -> usize {
        self.inner.heap_size()
    }
}
863
/// Represents common operations for a column chunk.
impl ColumnChunkMetaData {
    /// Returns builder for column chunk metadata.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// File where the column chunk is stored.
    ///
    /// If not set, assumed to belong to the same file as the metadata.
    /// This path is relative to the current file.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Byte offset of `ColumnMetaData` in `file_path()`.
    ///
    /// Note that the meaning of this field has been inconsistent between implementations
    /// so its use has since been deprecated in the Parquet specification. Modern implementations
    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
    /// `ColumnChunk` struct.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Type of this column. Must be primitive.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Path (or identifier) of this column.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Descriptor for this column.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Reference counted clone of descriptor for this column.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// All encodings used for this column.
    pub fn encodings(&self) -> &Vec<Encoding> {
        &self.encodings
    }

    /// Total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Compression for this column.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed data size of this column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed data size of this column chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset for the column data.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset for the index page.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset for the dictionary page, if any.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the offset and length in bytes of the column chunk within the file
    ///
    /// The chunk starts at the dictionary page (when present) or otherwise at
    /// the first data page.
    ///
    /// # Panics
    /// If the computed start offset or the compressed length is negative.
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns statistics that are set for this column chunk,
    /// or `None` if no statistics are available.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns the page encoding stats for this column chunk,
    /// or `None` if no page encoding stats are available.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        self.encoding_stats.as_ref()
    }

    /// Returns the offset for the bloom filter.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length of the bloom filter.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset for the column index.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length of the column index.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range for the column index, if any
    pub(crate) fn column_index_range(&self) -> Option<Range<usize>> {
        let offset = usize::try_from(self.column_index_offset?).ok()?;
        let length = usize::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset for the offset index.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length of the offset index.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range for the offset index, if any
    pub(crate) fn offset_index_range(&self) -> Option<Range<usize>> {
        let offset = usize::try_from(self.offset_index_offset?).ok()?;
        let length = usize::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of bytes of variable length data after decoding.
    ///
    /// Only set for BYTE_ARRAY columns. This field may not be set by older
    /// writers.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
    /// `vec[0]` indicates how many rows the page contains.
    /// This field may not be set by older writers.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
    /// This field may not be set by older writers.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Method to convert from Thrift.
    ///
    /// # Errors
    /// Returns an error if `cc` has no embedded `ColumnMetaData`, or if the
    /// physical type, encodings, codec, statistics, or page encoding stats
    /// fail to convert.
    pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
        if cc.meta_data.is_none() {
            return Err(general_err!("Expected to have column metadata"));
        }
        let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap();
        let column_type = Type::try_from(col_metadata.type_)?;
        let encodings = col_metadata
            .encodings
            .drain(0..)
            .map(Encoding::try_from)
            .collect::<Result<_>>()?;
        let compression = Compression::try_from(col_metadata.codec)?;
        let file_path = cc.file_path;
        let file_offset = cc.file_offset;
        let num_values = col_metadata.num_values;
        let total_compressed_size = col_metadata.total_compressed_size;
        let total_uncompressed_size = col_metadata.total_uncompressed_size;
        let data_page_offset = col_metadata.data_page_offset;
        let index_page_offset = col_metadata.index_page_offset;
        let dictionary_page_offset = col_metadata.dictionary_page_offset;
        let statistics = statistics::from_thrift(column_type, col_metadata.statistics)?;
        let encoding_stats = col_metadata
            .encoding_stats
            .as_ref()
            .map(|vec| {
                vec.iter()
                    .map(page_encoding_stats::try_from_thrift)
                    .collect::<Result<_>>()
            })
            .transpose()?;
        let bloom_filter_offset = col_metadata.bloom_filter_offset;
        let bloom_filter_length = col_metadata.bloom_filter_length;
        let offset_index_offset = cc.offset_index_offset;
        let offset_index_length = cc.offset_index_length;
        let column_index_offset = cc.column_index_offset;
        let column_index_length = cc.column_index_length;
        // Size statistics are optional in the thrift metadata; treat a missing
        // struct as all fields unset.
        let (
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
        ) = if let Some(size_stats) = col_metadata.size_statistics {
            (
                size_stats.unencoded_byte_array_data_bytes,
                size_stats.repetition_level_histogram,
                size_stats.definition_level_histogram,
            )
        } else {
            (None, None, None)
        };

        // Histograms are stored as raw `Vec<i64>` in thrift; wrap them.
        let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from);
        let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);

        let result = ColumnChunkMetaData {
            column_descr,
            encodings,
            file_path,
            file_offset,
            num_values,
            compression,
            total_compressed_size,
            total_uncompressed_size,
            data_page_offset,
            index_page_offset,
            dictionary_page_offset,
            statistics,
            encoding_stats,
            bloom_filter_offset,
            bloom_filter_length,
            offset_index_offset,
            offset_index_length,
            column_index_offset,
            column_index_length,
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
        };
        Ok(result)
    }

    /// Method to convert to Thrift.
    ///
    /// The returned `ColumnChunk` embeds the `ColumnMetaData` inline and
    /// leaves the encryption-related fields unset.
    pub fn to_thrift(&self) -> ColumnChunk {
        let column_metadata = self.to_column_metadata_thrift();

        ColumnChunk {
            file_path: self.file_path().map(|s| s.to_owned()),
            file_offset: self.file_offset,
            meta_data: Some(column_metadata),
            offset_index_offset: self.offset_index_offset,
            offset_index_length: self.offset_index_length,
            column_index_offset: self.column_index_offset,
            column_index_length: self.column_index_length,
            crypto_metadata: None,
            encrypted_column_metadata: None,
        }
    }

    /// Method to convert to Thrift `ColumnMetaData`
    pub fn to_column_metadata_thrift(&self) -> ColumnMetaData {
        // Only emit a SizeStatistics struct when at least one of its fields
        // is present; otherwise leave it out entirely.
        let size_statistics = if self.unencoded_byte_array_data_bytes.is_some()
            || self.repetition_level_histogram.is_some()
            || self.definition_level_histogram.is_some()
        {
            let repetition_level_histogram = self
                .repetition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            let definition_level_histogram = self
                .definition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            Some(SizeStatistics {
                unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes,
                repetition_level_histogram,
                definition_level_histogram,
            })
        } else {
            None
        };

        ColumnMetaData {
            type_: self.column_type().into(),
            encodings: self.encodings().iter().map(|&v| v.into()).collect(),
            path_in_schema: self.column_path().as_ref().to_vec(),
            codec: self.compression.into(),
            num_values: self.num_values,
            total_uncompressed_size: self.total_uncompressed_size,
            total_compressed_size: self.total_compressed_size,
            key_value_metadata: None,
            data_page_offset: self.data_page_offset,
            index_page_offset: self.index_page_offset,
            dictionary_page_offset: self.dictionary_page_offset,
            statistics: statistics::to_thrift(self.statistics.as_ref()),
            encoding_stats: self
                .encoding_stats
                .as_ref()
                .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()),
            bloom_filter_offset: self.bloom_filter_offset,
            bloom_filter_length: self.bloom_filter_length,
            size_statistics,
        }
    }

    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }
}
1196
/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one.
///
/// See also [`ColumnChunkMetaData::builder`] for the usual way to obtain one.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1216
impl ColumnChunkMetaDataBuilder {
    /// Creates new column chunk metadata builder.
    ///
    /// All fields start at their zero/`None` defaults with no compression.
    ///
    /// See also [`ColumnChunkMetaData::builder`]
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Vec::new(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
        })
    }

    /// Sets list of encodings for this column chunk.
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets optional file path for this column chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets file offset in bytes.
    ///
    /// This field was meant to provide an alternate to storing `ColumnMetadata` directly in
    /// the `ColumnChunkMetadata`. However, most Parquet readers assume the `ColumnMetadata`
    /// is stored inline and ignore this field.
    #[deprecated(
        since = "53.0.0",
        note = "The Parquet specification requires this field to be 0"
    )]
    pub fn set_file_offset(mut self, value: i64) -> Self {
        self.0.file_offset = value;
        self
    }

    /// Sets number of values.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets compression.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets data page offset in bytes.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets optional dictionary page offset in bytes.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets optional index page offset in bytes.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets statistics for this column chunk.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Clears the statistics for this column chunk.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets page encoding stats for this column chunk.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(value);
        self
    }

    /// Clears the page encoding stats for this column chunk.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets optional bloom filter offset in bytes.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets optional bloom filter length in bytes.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets optional offset index offset in bytes.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets optional offset index length in bytes.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets optional column index offset in bytes.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets optional column index length in bytes.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets optional length of variable length data in bytes.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets optional repetition level histogram
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets optional definition level histogram
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    /// Builds column chunk metadata.
    ///
    /// Note: this currently always returns `Ok`; the `Result` return type
    /// leaves room for future validation.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}
1399
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
    /// One entry per page: whether the page contains only null values
    null_pages: Vec<bool>,
    /// One entry per page: the encoded minimum value
    min_values: Vec<Vec<u8>>,
    /// One entry per page: the encoded maximum value
    max_values: Vec<Vec<u8>>,
    /// One entry per page: the number of null values
    null_counts: Vec<i64>,
    /// Ordering of the min/max values across pages
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any entry in the page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-nan values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}
1422
impl Default for ColumnIndexBuilder {
    /// Equivalent to [`ColumnIndexBuilder::new`].
    fn default() -> Self {
        Self::new()
    }
}
1428
1429impl ColumnIndexBuilder {
1430    /// Creates a new column index builder.
1431    pub fn new() -> Self {
1432        ColumnIndexBuilder {
1433            null_pages: Vec::new(),
1434            min_values: Vec::new(),
1435            max_values: Vec::new(),
1436            null_counts: Vec::new(),
1437            boundary_order: BoundaryOrder::UNORDERED,
1438            repetition_level_histograms: None,
1439            definition_level_histograms: None,
1440            valid: true,
1441        }
1442    }
1443
1444    /// Append statistics for the next page
1445    pub fn append(
1446        &mut self,
1447        null_page: bool,
1448        min_value: Vec<u8>,
1449        max_value: Vec<u8>,
1450        null_count: i64,
1451    ) {
1452        self.null_pages.push(null_page);
1453        self.min_values.push(min_value);
1454        self.max_values.push(max_value);
1455        self.null_counts.push(null_count);
1456    }
1457
1458    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
1459    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
1460    pub fn append_histograms(
1461        &mut self,
1462        repetition_level_histogram: &Option<LevelHistogram>,
1463        definition_level_histogram: &Option<LevelHistogram>,
1464    ) {
1465        if !self.valid {
1466            return;
1467        }
1468        if let Some(ref rep_lvl_hist) = repetition_level_histogram {
1469            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1470            hist.reserve(rep_lvl_hist.len());
1471            hist.extend(rep_lvl_hist.values());
1472        }
1473        if let Some(ref def_lvl_hist) = definition_level_histogram {
1474            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1475            hist.reserve(def_lvl_hist.len());
1476            hist.extend(def_lvl_hist.values());
1477        }
1478    }
1479
1480    /// Set the boundary order of the column index
1481    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1482        self.boundary_order = boundary_order;
1483    }
1484
1485    /// Mark this column index as invalid
1486    pub fn to_invalid(&mut self) {
1487        self.valid = false;
1488    }
1489
1490    /// Is the information in the builder valid?
1491    pub fn valid(&self) -> bool {
1492        self.valid
1493    }
1494
1495    /// Build and get the thrift metadata of column index
1496    ///
1497    /// Note: callers should check [`Self::valid`] before calling this method
1498    pub fn build_to_thrift(self) -> ColumnIndex {
1499        ColumnIndex::new(
1500            self.null_pages,
1501            self.min_values,
1502            self.max_values,
1503            self.boundary_order,
1504            self.null_counts,
1505            self.repetition_level_histograms,
1506            self.definition_level_histograms,
1507        )
1508    }
1509}
1510
1511impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1512    fn from(value: ColumnChunkMetaData) -> Self {
1513        ColumnChunkMetaDataBuilder(value)
1514    }
1515}
1516
/// Builder for offset index, part of the Parquet [PageIndex].
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
    /// One entry per page: byte offset of the page
    offset_array: Vec<i64>,
    /// One entry per page: compressed page size
    compressed_page_size_array: Vec<i32>,
    /// One entry per page: index of the page's first row
    first_row_index_array: Vec<i64>,
    /// One entry per page (when reported): unencoded byte array data bytes
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running row count, i.e. the first row index of the next page
    current_first_row_index: i64,
}
1527
impl Default for OffsetIndexBuilder {
    /// Equivalent to [`OffsetIndexBuilder::new`].
    fn default() -> Self {
        Self::new()
    }
}
1533
1534impl OffsetIndexBuilder {
1535    /// Creates a new offset index builder.
1536    pub fn new() -> Self {
1537        OffsetIndexBuilder {
1538            offset_array: Vec::new(),
1539            compressed_page_size_array: Vec::new(),
1540            first_row_index_array: Vec::new(),
1541            unencoded_byte_array_data_bytes_array: None,
1542            current_first_row_index: 0,
1543        }
1544    }
1545
1546    /// Append the row count of the next page.
1547    pub fn append_row_count(&mut self, row_count: i64) {
1548        let current_page_row_index = self.current_first_row_index;
1549        self.first_row_index_array.push(current_page_row_index);
1550        self.current_first_row_index += row_count;
1551    }
1552
1553    /// Append the offset and size of the next page.
1554    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1555        self.offset_array.push(offset);
1556        self.compressed_page_size_array.push(compressed_page_size);
1557    }
1558
1559    /// Append the unencoded byte array data bytes of the next page.
1560    pub fn append_unencoded_byte_array_data_bytes(
1561        &mut self,
1562        unencoded_byte_array_data_bytes: Option<i64>,
1563    ) {
1564        if let Some(val) = unencoded_byte_array_data_bytes {
1565            self.unencoded_byte_array_data_bytes_array
1566                .get_or_insert(Vec::new())
1567                .push(val);
1568        }
1569    }
1570
1571    /// Build and get the thrift metadata of offset index
1572    pub fn build_to_thrift(self) -> OffsetIndex {
1573        let locations = self
1574            .offset_array
1575            .iter()
1576            .zip(self.compressed_page_size_array.iter())
1577            .zip(self.first_row_index_array.iter())
1578            .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index))
1579            .collect::<Vec<_>>();
1580        OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array)
1581    }
1582}
1583
1584#[cfg(test)]
1585mod tests {
1586    use super::*;
1587    use crate::basic::{PageType, SortOrder};
1588    use crate::file::page_index::index::NativeIndex;
1589
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        // Round-trip: RowGroupMetaData -> thrift -> RowGroupMetaData -> thrift
        // must be lossless.
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        let row_group_exp = row_group_meta.to_thrift();
        let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(row_group_res, row_group_exp);
    }
1614
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        // Building a row group with no column metadata must fail with a
        // column length mismatch error.
        let schema_descr = get_test_schema_descr();

        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }
1629
    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        // Schema with two INT32 columns, used to build the row group.
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // Schema with three INT32 columns, used when decoding — simulating a
        // corrupted file whose schema disagrees with its row group.
        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();

        // Decoding with the mismatched schema must produce a clear error.
        let err =
            RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift())
                .unwrap_err()
                .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }
1698
1699    #[test]
1700    fn test_column_chunk_metadata_thrift_conversion() {
1701        let column_descr = get_test_schema_descr().column(0);
1702
1703        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1704            .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
1705            .set_file_path("file_path".to_owned())
1706            .set_num_values(1000)
1707            .set_compression(Compression::SNAPPY)
1708            .set_total_compressed_size(2000)
1709            .set_total_uncompressed_size(3000)
1710            .set_data_page_offset(4000)
1711            .set_dictionary_page_offset(Some(5000))
1712            .set_page_encoding_stats(vec![
1713                PageEncodingStats {
1714                    page_type: PageType::DATA_PAGE,
1715                    encoding: Encoding::PLAIN,
1716                    count: 3,
1717                },
1718                PageEncodingStats {
1719                    page_type: PageType::DATA_PAGE,
1720                    encoding: Encoding::RLE,
1721                    count: 5,
1722                },
1723            ])
1724            .set_bloom_filter_offset(Some(6000))
1725            .set_bloom_filter_length(Some(25))
1726            .set_offset_index_offset(Some(7000))
1727            .set_offset_index_length(Some(25))
1728            .set_column_index_offset(Some(8000))
1729            .set_column_index_length(Some(25))
1730            .set_unencoded_byte_array_data_bytes(Some(2000))
1731            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
1732            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
1733            .build()
1734            .unwrap();
1735
1736        let col_chunk_res =
1737            ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()).unwrap();
1738
1739        assert_eq!(col_chunk_res, col_metadata);
1740    }
1741
1742    #[test]
1743    fn test_column_chunk_metadata_thrift_conversion_empty() {
1744        let column_descr = get_test_schema_descr().column(0);
1745
1746        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1747            .build()
1748            .unwrap();
1749
1750        let col_chunk_exp = col_metadata.to_thrift();
1751        let col_chunk_res = ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone())
1752            .unwrap()
1753            .to_thrift();
1754
1755        assert_eq!(col_chunk_res, col_chunk_exp);
1756    }
1757
1758    #[test]
1759    fn test_compressed_size() {
1760        let schema_descr = get_test_schema_descr();
1761
1762        let mut columns = vec![];
1763        for column_descr in schema_descr.columns() {
1764            let column = ColumnChunkMetaData::builder(column_descr.clone())
1765                .set_total_compressed_size(500)
1766                .set_total_uncompressed_size(700)
1767                .build()
1768                .unwrap();
1769            columns.push(column);
1770        }
1771        let row_group_meta = RowGroupMetaData::builder(schema_descr)
1772            .set_num_rows(1000)
1773            .set_column_metadata(columns)
1774            .build()
1775            .unwrap();
1776
1777        let compressed_size_res: i64 = row_group_meta.compressed_size();
1778        let compressed_size_exp: i64 = 1000;
1779
1780        assert_eq!(compressed_size_res, compressed_size_exp);
1781    }
1782
    // Checks `ParquetMetaData::memory_size` against hard-coded expected byte
    // counts: first a baseline metadata object, then one with more fields set
    // (exact statistics, column index, offset index), which must report a
    // strictly larger size.
    //
    // NOTE(review): the expected sizes (2312 / 2816) are tied to the current
    // in-memory layout of the metadata structs; they will need updating
    // whenever fields are added, removed, or resized.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        // Column chunks with "empty" statistics (no min/max values set).
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        // File-level metadata shared by both ParquetMetaData instances below.
        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Now, add in Exact Statistics
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        // Baseline: metadata with exact statistics but no page indexes.
        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();
        let base_expected_size = 2312;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // A single-page boolean column index to attach to the metadata.
        let mut column_index = ColumnIndexBuilder::new();
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build_to_thrift();
        let native_index = NativeIndex::<bool>::try_new(column_index).unwrap();

        // Now, add in OffsetIndex
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build_to_thrift();

        // Same file metadata plus column index and offset index.
        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![
                OffsetIndexMetaData::try_new(offset_index).unwrap()
            ]]))
            .build();

        let bigger_expected_size = 2816;
        // more set fields means more memory usage
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }
1884
1885    /// Returns sample schema descriptor so we can create column metadata.
1886    fn get_test_schema_descr() -> SchemaDescPtr {
1887        let schema = SchemaType::group_type_builder("schema")
1888            .with_fields(vec![
1889                Arc::new(
1890                    SchemaType::primitive_type_builder("a", Type::INT32)
1891                        .build()
1892                        .unwrap(),
1893                ),
1894                Arc::new(
1895                    SchemaType::primitive_type_builder("b", Type::INT32)
1896                        .build()
1897                        .unwrap(),
1898                ),
1899            ])
1900            .build()
1901            .unwrap();
1902
1903        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
1904    }
1905}