parquet/file/metadata/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet metadata API
19//!
20//! Users should use these structures to interact with Parquet metadata.
21//!
22//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
23//!   file footer.
24//!
25//! * [`FileMetaData`]: File level metadata such as schema, row counts and
26//!   version.
27//!
28//! * [`RowGroupMetaData`]: Metadata for each Row Group with a File, such as
29//!   location and number of rows, and column chunks.
30//!
31//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
32//!   within a Row Group including encoding and compression information,
33//!   number of values, statistics, etc.
34//!
35//! # APIs for working with Parquet Metadata
36//!
37//! The Parquet readers and writers in this crate handle reading and writing
38//! metadata into parquet files. To work with metadata directly,
39//! the following APIs are available:
40//!
41//! * [`ParquetMetaDataReader`] for reading metadata from an I/O source (sync and async)
42//! * [`ParquetMetaDataPushDecoder`] for decoding from bytes without I/O
43//! * [`ParquetMetaDataWriter`] for writing.
44//!
45//!
46//! # Examples
47//!
48//! Please see [`external_metadata.rs`]
49//!
50//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
51//!
52//! # Metadata Encodings and Structures
53//!
54//! There are three different encodings of Parquet Metadata in this crate:
55//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
57//!    [parquet.thrift]
58//!
59//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
60//!    from [parquet.thrift]. These structures are low level and mirror
61//!    the thrift definitions.
62//!
63//! 3. [`file::metadata`] (this module): Easier to use Rust structures
64//!    with a more idiomatic API. Note that, confusingly, some but not all
65//!    of these structures have the same name as the [`format`] structures.
66//!
67//! [`file::metadata`]: crate::file::metadata
68//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
69//!
70//! Graphically, this is how the different structures relate to each other:
71//!
72//! ```text
73//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
74//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
75//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
76//!                            └──────────────┘     │         └───────────────────────┘ │
77//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
78//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
79//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
80//!                                     ...         │                   ...             │
81//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
82//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
83//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
84//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
85//!
86//!                          format::meta structures          file::metadata structures
87//!
88//!                         * Same name, different struct
89//! ```
90mod footer_tail;
91mod memory;
92mod parser;
93mod push_decoder;
94pub(crate) mod reader;
95pub(crate) mod thrift;
96mod writer;
97
98use crate::basic::{EncodingMask, PageType};
99#[cfg(feature = "encryption")]
100use crate::encryption::decrypt::FileDecryptor;
101#[cfg(feature = "encryption")]
102use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
103pub(crate) use crate::file::metadata::memory::HeapSize;
104#[cfg(feature = "encryption")]
105use crate::file::metadata::thrift::encryption::EncryptionAlgorithm;
106use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex};
107use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation};
108use crate::file::statistics::Statistics;
109use crate::geospatial::statistics as geo_statistics;
110use crate::schema::types::{
111    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
112    Type as SchemaType,
113};
114use crate::thrift_struct;
115use crate::{
116    basic::BoundaryOrder,
117    errors::{ParquetError, Result},
118};
119use crate::{
120    basic::{ColumnOrder, Compression, Encoding, Type},
121    parquet_thrift::{
122        ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
123        ThriftCompactOutputProtocol, WriteThrift, WriteThriftField,
124    },
125};
126use crate::{
127    data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
128};
129
130pub use footer_tail::FooterTail;
131pub use push_decoder::ParquetMetaDataPushDecoder;
132pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
133use std::io::Write;
134use std::ops::Range;
135use std::sync::Arc;
136pub use writer::ParquetMetaDataWriter;
137pub(crate) use writer::ThriftMetadataWriter;
138
/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`ColumnIndex`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`ColumnIndex`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example `column_index[2][3]` holds the [`ColumnIndex`] for the fourth
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;

/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number`.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
169
/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata (schema, number of rows, key/value metadata, ...)
    file_metadata: FileMetaData,
    /// Metadata for each row group, in file order
    row_groups: Vec<RowGroupMetaData>,
    /// Page level statistics ("ColumnIndex") for each page in each column
    /// chunk, if loaded
    column_index: Option<ParquetColumnIndex>,
    /// Offset index for each page in each column chunk, if loaded
    offset_index: Option<ParquetOffsetIndex>,
    /// Optional file decryptor used to decrypt encrypted data
    #[cfg(feature = "encryption")]
    file_decryptor: Option<Box<FileDecryptor>>,
}
201
202impl ParquetMetaData {
203    /// Creates Parquet metadata from file metadata and a list of row
204    /// group metadata
205    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
206        ParquetMetaData {
207            file_metadata,
208            row_groups,
209            column_index: None,
210            offset_index: None,
211            #[cfg(feature = "encryption")]
212            file_decryptor: None,
213        }
214    }
215
216    /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of
217    /// encrypted data.
218    #[cfg(feature = "encryption")]
219    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
220        self.file_decryptor = file_decryptor.map(Box::new);
221    }
222
223    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
224    pub fn into_builder(self) -> ParquetMetaDataBuilder {
225        self.into()
226    }
227
228    /// Returns file metadata as reference.
229    pub fn file_metadata(&self) -> &FileMetaData {
230        &self.file_metadata
231    }
232
233    /// Returns file decryptor as reference.
234    #[cfg(feature = "encryption")]
235    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
236        self.file_decryptor.as_deref()
237    }
238
239    /// Returns number of row groups in this file.
240    pub fn num_row_groups(&self) -> usize {
241        self.row_groups.len()
242    }
243
244    /// Returns row group metadata for `i`th position.
245    /// Position should be less than number of row groups `num_row_groups`.
246    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
247        &self.row_groups[i]
248    }
249
250    /// Returns slice of row groups in this file.
251    pub fn row_groups(&self) -> &[RowGroupMetaData] {
252        &self.row_groups
253    }
254
255    /// Returns the column index for this file if loaded
256    ///
257    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
258    /// [ArrowReaderOptions::with_page_index] was set to false.
259    ///
260    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
261    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
262        self.column_index.as_ref()
263    }
264
265    /// Returns offset indexes in this file, if loaded
266    ///
267    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
268    /// [ArrowReaderOptions::with_page_index] was set to false.
269    ///
270    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
271    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
272        self.offset_index.as_ref()
273    }
274
275    /// Estimate of the bytes allocated to store `ParquetMetadata`
276    ///
277    /// # Notes:
278    ///
279    /// 1. Includes size of self
280    ///
281    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
282    ///    [`RowGroupMetaData`].
283    ///
284    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
285    ///    means `memory_size` will over estimate the memory size if such pointers
286    ///    are shared.
287    ///
288    /// 4. Does not include any allocator overheads
289    pub fn memory_size(&self) -> usize {
290        std::mem::size_of::<Self>()
291            + self.file_metadata.heap_size()
292            + self.row_groups.heap_size()
293            + self.column_index.heap_size()
294            + self.offset_index.heap_size()
295    }
296
297    /// Override the column index
298    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
299        self.column_index = index;
300    }
301
302    /// Override the offset index
303    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
304        self.offset_index = index;
305    }
306}
307
/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// # Example creating a new [`ParquetMetaData`]
///
///```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
///
/// Most setter methods consume `self` and return the builder so calls can be
/// chained; call [`Self::build`] to produce the final [`ParquetMetaData`].
pub struct ParquetMetaDataBuilder(ParquetMetaData);
346
347impl ParquetMetaDataBuilder {
348    /// Create a new builder from a file metadata, with no row groups
349    pub fn new(file_meta_data: FileMetaData) -> Self {
350        Self(ParquetMetaData::new(file_meta_data, vec![]))
351    }
352
353    /// Create a new builder from an existing ParquetMetaData
354    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
355        Self(metadata)
356    }
357
358    /// Adds a row group to the metadata
359    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
360        self.0.row_groups.push(row_group);
361        self
362    }
363
364    /// Sets all the row groups to the specified list
365    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
366        self.0.row_groups = row_groups;
367        self
368    }
369
370    /// Takes ownership of the row groups in this builder, and clears the list
371    /// of row groups.
372    ///
373    /// This can be used for more efficient creation of a new ParquetMetaData
374    /// from an existing one.
375    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
376        std::mem::take(&mut self.0.row_groups)
377    }
378
379    /// Return a reference to the current row groups
380    pub fn row_groups(&self) -> &[RowGroupMetaData] {
381        &self.0.row_groups
382    }
383
384    /// Sets the column index
385    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
386        self.0.column_index = column_index;
387        self
388    }
389
390    /// Returns the current column index from the builder, replacing it with `None`
391    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
392        std::mem::take(&mut self.0.column_index)
393    }
394
395    /// Return a reference to the current column index, if any
396    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
397        self.0.column_index.as_ref()
398    }
399
400    /// Sets the offset index
401    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
402        self.0.offset_index = offset_index;
403        self
404    }
405
406    /// Returns the current offset index from the builder, replacing it with `None`
407    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
408        std::mem::take(&mut self.0.offset_index)
409    }
410
411    /// Return a reference to the current offset index, if any
412    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
413        self.0.offset_index.as_ref()
414    }
415
416    /// Sets the file decryptor needed to decrypt this metadata.
417    #[cfg(feature = "encryption")]
418    pub(crate) fn set_file_decryptor(mut self, file_decryptor: Option<FileDecryptor>) -> Self {
419        self.0.with_file_decryptor(file_decryptor);
420        self
421    }
422
423    /// Creates a new ParquetMetaData from the builder
424    pub fn build(self) -> ParquetMetaData {
425        let Self(metadata) = self;
426        metadata
427    }
428}
429
430impl From<ParquetMetaData> for ParquetMetaDataBuilder {
431    fn from(meta_data: ParquetMetaData) -> Self {
432        Self(meta_data)
433    }
434}
435
thrift_struct!(
/// A key-value pair for [`FileMetaData`].
pub struct KeyValue {
  /// The key of the entry
  1: required string key

  /// The value of the entry; may be absent
  2: optional string value
}
);
443
444impl KeyValue {
445    /// Create a new key value pair
446    pub fn new<F2>(key: String, value: F2) -> KeyValue
447    where
448        F2: Into<Option<String>>,
449    {
450        KeyValue {
451            key,
452            value: value.into(),
453        }
454    }
455}
456
thrift_struct!(
/// PageEncodingStats for a column chunk and data page.
pub struct PageEncodingStats {
  /// The type of the page
  1: required PageType page_type;

  /// The encoding used for the page
  2: required Encoding encoding;

  /// The number of pages of this type with this encoding
  3: required i32 count;
}
);
465
/// Reference counted pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;

/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Version of this file
    version: i32,
    /// Total number of rows in this file
    num_rows: i64,
    /// Application that wrote this file, if recorded
    created_by: Option<String>,
    /// Optional application-specific key/value metadata
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Descriptor of the file schema (shared pointer)
    schema_descr: SchemaDescPtr,
    /// Sort order used for `min`/`max` values of each column, if present
    column_orders: Option<Vec<ColumnOrder>>,
    /// Algorithm used to encrypt this file, if any
    #[cfg(feature = "encryption")]
    encryption_algorithm: Option<Box<EncryptionAlgorithm>>,
    /// Key metadata used for footer signing, if any
    #[cfg(feature = "encryption")]
    footer_signing_key_metadata: Option<Vec<u8>>,
}
485
486impl FileMetaData {
487    /// Creates new file metadata.
488    pub fn new(
489        version: i32,
490        num_rows: i64,
491        created_by: Option<String>,
492        key_value_metadata: Option<Vec<KeyValue>>,
493        schema_descr: SchemaDescPtr,
494        column_orders: Option<Vec<ColumnOrder>>,
495    ) -> Self {
496        FileMetaData {
497            version,
498            num_rows,
499            created_by,
500            key_value_metadata,
501            schema_descr,
502            column_orders,
503            #[cfg(feature = "encryption")]
504            encryption_algorithm: None,
505            #[cfg(feature = "encryption")]
506            footer_signing_key_metadata: None,
507        }
508    }
509
510    #[cfg(feature = "encryption")]
511    pub(crate) fn with_encryption_algorithm(
512        mut self,
513        encryption_algorithm: Option<EncryptionAlgorithm>,
514    ) -> Self {
515        self.encryption_algorithm = encryption_algorithm.map(Box::new);
516        self
517    }
518
519    #[cfg(feature = "encryption")]
520    pub(crate) fn with_footer_signing_key_metadata(
521        mut self,
522        footer_signing_key_metadata: Option<Vec<u8>>,
523    ) -> Self {
524        self.footer_signing_key_metadata = footer_signing_key_metadata;
525        self
526    }
527
528    /// Returns version of this file.
529    pub fn version(&self) -> i32 {
530        self.version
531    }
532
533    /// Returns number of rows in the file.
534    pub fn num_rows(&self) -> i64 {
535        self.num_rows
536    }
537
538    /// String message for application that wrote this file.
539    ///
540    /// This should have the following format:
541    /// `<application> version <application version> (build <application build hash>)`.
542    ///
543    /// ```shell
544    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
545    /// ```
546    pub fn created_by(&self) -> Option<&str> {
547        self.created_by.as_deref()
548    }
549
550    /// Returns key_value_metadata of this file.
551    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
552        self.key_value_metadata.as_ref()
553    }
554
555    /// Returns Parquet [`Type`] that describes schema in this file.
556    ///
557    /// [`Type`]: crate::schema::types::Type
558    pub fn schema(&self) -> &SchemaType {
559        self.schema_descr.root_schema()
560    }
561
562    /// Returns a reference to schema descriptor.
563    pub fn schema_descr(&self) -> &SchemaDescriptor {
564        &self.schema_descr
565    }
566
567    /// Returns reference counted clone for schema descriptor.
568    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
569        self.schema_descr.clone()
570    }
571
572    /// Column (sort) order used for `min` and `max` values of each column in this file.
573    ///
574    /// Each column order corresponds to one column, determined by its position in the
575    /// list, matching the position of the column in the schema.
576    ///
577    /// When `None` is returned, there are no column orders available, and each column
578    /// should be assumed to have undefined (legacy) column order.
579    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
580        self.column_orders.as_ref()
581    }
582
583    /// Returns column order for `i`th column in this file.
584    /// If column orders are not available, returns undefined (legacy) column order.
585    pub fn column_order(&self, i: usize) -> ColumnOrder {
586        self.column_orders
587            .as_ref()
588            .map(|data| data[i])
589            .unwrap_or(ColumnOrder::UNDEFINED)
590    }
591}
592
thrift_struct!(
/// Sort order within a RowGroup of a leaf column
pub struct SortingColumn {
  /// The ordinal position of the column (in this row group)
  1: required i32 column_idx

  /// If true, indicates this column is sorted in descending order.
  2: required bool descending

  /// If true, nulls will come before non-null values, otherwise,
  /// nulls go at the end.
  3: required bool nulls_first
}
);
607
/// Reference counted pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk in this row group
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group
    num_rows: i64,
    /// Sort ordering of the rows, if any
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total byte size of all uncompressed column data in this row group
    total_byte_size: i64,
    /// Schema descriptor for this row group (shared pointer)
    schema_descr: SchemaDescPtr,
    /// We can't infer from the file offset of the first column since there may
    /// be empty columns in the row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
}
627
628impl RowGroupMetaData {
629    /// Returns builder for row group metadata.
630    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
631        RowGroupMetaDataBuilder::new(schema_descr)
632    }
633
634    /// Number of columns in this row group.
635    pub fn num_columns(&self) -> usize {
636        self.columns.len()
637    }
638
639    /// Returns column chunk metadata for `i`th column.
640    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
641        &self.columns[i]
642    }
643
644    /// Returns slice of column chunk metadata.
645    pub fn columns(&self) -> &[ColumnChunkMetaData] {
646        &self.columns
647    }
648
649    /// Returns mutable slice of column chunk metadata.
650    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
651        &mut self.columns
652    }
653
654    /// Number of rows in this row group.
655    pub fn num_rows(&self) -> i64 {
656        self.num_rows
657    }
658
659    /// Returns the sort ordering of the rows in this RowGroup if any
660    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
661        self.sorting_columns.as_ref()
662    }
663
664    /// Total byte size of all uncompressed column data in this row group.
665    pub fn total_byte_size(&self) -> i64 {
666        self.total_byte_size
667    }
668
669    /// Total size of all compressed column data in this row group.
670    pub fn compressed_size(&self) -> i64 {
671        self.columns.iter().map(|c| c.total_compressed_size).sum()
672    }
673
674    /// Returns reference to a schema descriptor.
675    pub fn schema_descr(&self) -> &SchemaDescriptor {
676        self.schema_descr.as_ref()
677    }
678
679    /// Returns reference counted clone of schema descriptor.
680    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
681        self.schema_descr.clone()
682    }
683
684    /// Returns ordinal position of this row group in file.
685    ///
686    /// For example if this is the first row group in the file, this will return 0.
687    /// If this is the second row group in the file, this will return 1.
688    #[inline(always)]
689    pub fn ordinal(&self) -> Option<i16> {
690        self.ordinal
691    }
692
693    /// Returns file offset of this row group in file.
694    #[inline(always)]
695    pub fn file_offset(&self) -> Option<i64> {
696        self.file_offset
697    }
698
699    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
700    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
701        RowGroupMetaDataBuilder(self)
702    }
703}
704
/// Builder for row group metadata.
///
/// Created via [`RowGroupMetaData::builder`]. Setter methods consume `self`
/// and return the builder for chaining; call [`Self::build`] (which validates
/// the column count against the schema) to produce the final [`RowGroupMetaData`].
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
707
708impl RowGroupMetaDataBuilder {
709    /// Creates new builder from schema descriptor.
710    fn new(schema_descr: SchemaDescPtr) -> Self {
711        Self(RowGroupMetaData {
712            columns: Vec::with_capacity(schema_descr.num_columns()),
713            schema_descr,
714            file_offset: None,
715            num_rows: 0,
716            sorting_columns: None,
717            total_byte_size: 0,
718            ordinal: None,
719        })
720    }
721
722    /// Sets number of rows in this row group.
723    pub fn set_num_rows(mut self, value: i64) -> Self {
724        self.0.num_rows = value;
725        self
726    }
727
728    /// Sets the sorting order for columns
729    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
730        self.0.sorting_columns = value;
731        self
732    }
733
734    /// Sets total size in bytes for this row group.
735    pub fn set_total_byte_size(mut self, value: i64) -> Self {
736        self.0.total_byte_size = value;
737        self
738    }
739
740    /// Takes ownership of the the column metadata in this builder, and clears
741    /// the list of columns.
742    ///
743    /// This can be used for more efficient creation of a new RowGroupMetaData
744    /// from an existing one.
745    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
746        std::mem::take(&mut self.0.columns)
747    }
748
749    /// Sets column metadata for this row group.
750    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
751        self.0.columns = value;
752        self
753    }
754
755    /// Adds a column metadata to this row group
756    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
757        self.0.columns.push(value);
758        self
759    }
760
761    /// Sets ordinal for this row group.
762    pub fn set_ordinal(mut self, value: i16) -> Self {
763        self.0.ordinal = Some(value);
764        self
765    }
766
767    /// Sets file offset for this row group.
768    pub fn set_file_offset(mut self, value: i64) -> Self {
769        self.0.file_offset = Some(value);
770        self
771    }
772
773    /// Builds row group metadata.
774    pub fn build(self) -> Result<RowGroupMetaData> {
775        if self.0.schema_descr.num_columns() != self.0.columns.len() {
776            return Err(general_err!(
777                "Column length mismatch: {} != {}",
778                self.0.schema_descr.num_columns(),
779                self.0.columns.len()
780            ));
781        }
782
783        Ok(self.0)
784    }
785
786    /// Build row group metadata without validation.
787    pub(super) fn build_unchecked(self) -> RowGroupMetaData {
788        self.0
789    }
790}
791
/// Metadata for a column chunk.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor (path, type, levels) of this column
    column_descr: ColumnDescPtr,
    /// Set of all encodings used in this column chunk
    encodings: EncodingMask,
    /// Path of the file containing this column chunk's data, if stored
    /// separately from the metadata
    file_path: Option<String>,
    /// File offset recorded for this column chunk
    file_offset: i64,
    /// Total number of values in this column chunk
    num_values: i64,
    /// Compression codec used for this column chunk
    compression: Compression,
    /// Total compressed size of this column chunk's data, in bytes
    total_compressed_size: i64,
    /// Total uncompressed size of this column chunk's data, in bytes
    total_uncompressed_size: i64,
    /// Byte offset of the first data page
    data_page_offset: i64,
    /// Byte offset of the index page, if present
    index_page_offset: Option<i64>,
    /// Byte offset of the dictionary page, if present
    dictionary_page_offset: Option<i64>,
    /// Statistics for this column chunk, if written
    statistics: Option<Statistics>,
    /// Geospatial statistics for this column chunk, if written
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    /// Page counts broken down by page type and encoding
    encoding_stats: Option<Vec<PageEncodingStats>>,
    /// Byte offset of this column's bloom filter, if any
    bloom_filter_offset: Option<i64>,
    /// Length in bytes of this column's bloom filter, if known
    bloom_filter_length: Option<i32>,
    /// Byte offset of this column's OffsetIndex, if any
    offset_index_offset: Option<i64>,
    /// Length in bytes of this column's OffsetIndex, if any
    offset_index_length: Option<i32>,
    /// Byte offset of this column's ColumnIndex, if any
    column_index_offset: Option<i64>,
    /// Length in bytes of this column's ColumnIndex, if any
    column_index_length: Option<i32>,
    /// NOTE(review): presumably mirrors `SizeStatistics.unencoded_byte_array_data_bytes`
    /// from parquet.thrift (raw BYTE_ARRAY data size before encoding) — confirm against writer
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Histogram of repetition levels, if written
    repetition_level_histogram: Option<LevelHistogram>,
    /// Histogram of definition levels, if written
    definition_level_histogram: Option<LevelHistogram>,
    /// Cryptographic metadata for this column, when encrypted
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
    /// Raw encrypted column metadata bytes, when present
    #[cfg(feature = "encryption")]
    encrypted_column_metadata: Option<Vec<u8>>,
}
823
/// Histograms for repetition and definition levels.
///
/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of
/// values at level `i`.
///
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the
/// number of rows with level 1, and so on.
///
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    /// Counts per level: `inner[i]` is the number of values with level `i`
    inner: Vec<i64>,
}
836
837impl LevelHistogram {
838    /// Creates a new level histogram data.
839    ///
840    /// Length will be `max_level + 1`.
841    ///
842    /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case)
843    pub fn try_new(max_level: i16) -> Option<Self> {
844        if max_level > 0 {
845            Some(Self {
846                inner: vec![0; max_level as usize + 1],
847            })
848        } else {
849            None
850        }
851    }
852    /// Returns a reference to the the histogram's values.
853    pub fn values(&self) -> &[i64] {
854        &self.inner
855    }
856
857    /// Return the inner vector, consuming self
858    pub fn into_inner(self) -> Vec<i64> {
859        self.inner
860    }
861
862    /// Returns the histogram value at the given index.
863    ///
864    /// The value of `i` is the number of values with level `i`. For example,
865    /// `get(1)` returns the number of values with level 1.
866    ///
867    /// Returns `None` if the index is out of bounds.
868    pub fn get(&self, index: usize) -> Option<i64> {
869        self.inner.get(index).copied()
870    }
871
872    /// Adds the values from the other histogram to this histogram
873    ///
874    /// # Panics
875    /// If the histograms have different lengths
876    pub fn add(&mut self, other: &Self) {
877        assert_eq!(self.len(), other.len());
878        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
879            *dst += src;
880        }
881    }
882
883    /// return the length of the histogram
884    pub fn len(&self) -> usize {
885        self.inner.len()
886    }
887
888    /// returns if the histogram is empty
889    pub fn is_empty(&self) -> bool {
890        self.inner.is_empty()
891    }
892
893    /// Sets the values of all histogram levels to 0.
894    pub fn reset(&mut self) {
895        for value in self.inner.iter_mut() {
896            *value = 0;
897        }
898    }
899
900    /// Updates histogram values using provided repetition levels
901    ///
902    /// # Panics
903    /// if any of the levels is greater than the length of the histogram (
904    /// the argument supplied to [`Self::try_new`])
905    pub fn update_from_levels(&mut self, levels: &[i16]) {
906        for &level in levels {
907            self.inner[level as usize] += 1;
908        }
909    }
910}
911
impl From<Vec<i64>> for LevelHistogram {
    /// Wraps an existing vector of counts as a [`LevelHistogram`] without validation.
    fn from(inner: Vec<i64>) -> Self {
        Self { inner }
    }
}
917
impl From<LevelHistogram> for Vec<i64> {
    /// Unwraps the histogram into its inner vector of counts.
    fn from(value: LevelHistogram) -> Self {
        value.into_inner()
    }
}
923
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        // Only the inner vector's heap allocation contributes.
        self.inner.heap_size()
    }
}
929
/// Represents common operations for a column chunk.
impl ColumnChunkMetaData {
    /// Returns builder for column chunk metadata.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// File where the column chunk is stored.
    ///
    /// If not set, assumed to belong to the same file as the metadata.
    /// This path is relative to the current file.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Byte offset of `ColumnMetaData` in `file_path()`.
    ///
    /// Note that the meaning of this field has been inconsistent between implementations
    /// so its use has since been deprecated in the Parquet specification. Modern implementations
    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
    /// `ColumnChunk` struct.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Type of this column. Must be primitive.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Path (or identifier) of this column.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Descriptor for this column.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Reference counted clone of descriptor for this column.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// All encodings used for this column.
    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
        self.encodings.encodings()
    }

    /// All encodings used for this column, returned as a bitmask.
    pub fn encodings_mask(&self) -> &EncodingMask {
        &self.encodings
    }

    /// Total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Compression for this column.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed data size of this column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed data size of this column chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset for the column data.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset for the index page, if any.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset for the dictionary page, if any.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the offset and length in bytes of the column chunk within the file
    ///
    /// The chunk starts at the dictionary page when one is present, otherwise
    /// at the first data page, and spans `compressed_size()` bytes.
    ///
    /// # Panics
    /// If the start offset or the compressed length is negative.
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns statistics that are set for this column chunk,
    /// or `None` if no statistics are available.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns geospatial statistics that are set for this column chunk,
    /// or `None` if no geospatial statistics are available.
    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
        self.geo_statistics.as_deref()
    }

    /// Returns the page encoding stats for this column chunk,
    /// or `None` if no page encoding stats are available.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        self.encoding_stats.as_ref()
    }

    /// Returns the offset for the bloom filter.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length in bytes of the bloom filter.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset for the column index.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length in bytes of the column index.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range for the column index, if any
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset for the offset index.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length in bytes of the offset index.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range for the offset index, if any
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of bytes of variable length data after decoding.
    ///
    /// Only set for BYTE_ARRAY columns. This field may not be set by older
    /// writers.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
    /// `vec[0]` indicates how many rows the page contains.
    /// This field may not be set by older writers.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
    /// This field may not be set by older writers.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the encryption metadata for this column chunk.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_deref()
    }

    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }
}
1133
/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one. All setters consume and return the builder, so calls can be
/// chained.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1153
impl ColumnChunkMetaDataBuilder {
    /// Creates new column chunk metadata builder.
    ///
    /// All fields start zeroed/unset; only the column descriptor is required.
    ///
    /// See also [`ColumnChunkMetaData::builder`]
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Default::default(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            geo_statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
            #[cfg(feature = "encryption")]
            encrypted_column_metadata: None,
        })
    }

    /// Sets list of encodings for this column chunk.
    ///
    /// The list is stored internally as an [`EncodingMask`].
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = EncodingMask::new_from_encodings(encodings.iter());
        self
    }

    /// Sets the encodings mask for this column chunk.
    pub fn set_encodings_mask(mut self, encodings: EncodingMask) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets optional file path for this column chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets number of values.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets compression.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets data page offset in bytes.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets optional dictionary page offset in bytes.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets optional index page offset in bytes.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets statistics for this column chunk.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Sets geospatial statistics for this column chunk.
    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
        self.0.geo_statistics = Some(value);
        self
    }

    /// Clears the statistics for this column chunk.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets page encoding stats for this column chunk.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(value);
        self
    }

    /// Clears the page encoding stats for this column chunk.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets optional bloom filter offset in bytes.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets optional bloom filter length in bytes.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets optional offset index offset in bytes.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets optional offset index length in bytes.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets optional column index offset in bytes.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets optional column index length in bytes.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets optional length of variable length data in bytes.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets optional repetition level histogram
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets optional definition level histogram
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the encryption metadata for an encrypted column
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value.map(Box::new);
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the encrypted column metadata bytes for an encrypted column
    pub fn set_encrypted_column_metadata(mut self, value: Option<Vec<u8>>) -> Self {
        self.0.encrypted_column_metadata = value;
        self
    }

    /// Builds column chunk metadata.
    ///
    /// Currently infallible: no validation is performed on the accumulated
    /// fields.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}
1353
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
    /// Physical type of the column; selects the index variant produced by `build`.
    column_type: Type,
    /// Whether each page is entirely null, in page order.
    null_pages: Vec<bool>,
    /// Encoded min value of each page.
    min_values: Vec<Vec<u8>>,
    /// Encoded max value of each page.
    max_values: Vec<Vec<u8>>,
    /// Number of null values in each page.
    null_counts: Vec<i64>,
    /// Ordering of the min/max values across pages.
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any entry in the page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-nan values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}
1378
impl ColumnIndexBuilder {
    /// Creates a new column index builder.
    ///
    /// Starts empty, `UNORDERED`, and in the `valid` state.
    pub fn new(column_type: Type) -> Self {
        ColumnIndexBuilder {
            column_type,
            null_pages: Vec::new(),
            min_values: Vec::new(),
            max_values: Vec::new(),
            null_counts: Vec::new(),
            boundary_order: BoundaryOrder::UNORDERED,
            repetition_level_histograms: None,
            definition_level_histograms: None,
            valid: true,
        }
    }

    /// Append statistics for the next page
    ///
    /// The four parallel vectors grow in lockstep, so this must be called
    /// exactly once per page, in page order.
    pub fn append(
        &mut self,
        null_page: bool,
        min_value: Vec<u8>,
        max_value: Vec<u8>,
        null_count: i64,
    ) {
        self.null_pages.push(null_page);
        self.min_values.push(min_value);
        self.max_values.push(max_value);
        self.null_counts.push(null_count);
    }

    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
    ///
    /// Histograms are stored flattened: each page's histogram is concatenated
    /// onto the end of the accumulated vector.
    ///
    /// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
    pub fn append_histograms(
        &mut self,
        repetition_level_histogram: &Option<LevelHistogram>,
        definition_level_histogram: &Option<LevelHistogram>,
    ) {
        if !self.valid {
            return;
        }
        if let Some(rep_lvl_hist) = repetition_level_histogram {
            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
            hist.reserve(rep_lvl_hist.len());
            hist.extend(rep_lvl_hist.values());
        }
        if let Some(def_lvl_hist) = definition_level_histogram {
            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
            hist.reserve(def_lvl_hist.len());
            hist.extend(def_lvl_hist.values());
        }
    }

    /// Set the boundary order of the column index
    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
        self.boundary_order = boundary_order;
    }

    /// Mark this column index as invalid
    ///
    /// Once invalid, `append_histograms` becomes a no-op.
    pub fn to_invalid(&mut self) {
        self.valid = false;
    }

    /// Is the information in the builder valid?
    pub fn valid(&self) -> bool {
        self.valid
    }

    /// Build and get the column index
    ///
    /// The index variant is chosen by the physical `column_type` supplied to
    /// [`Self::new`].
    ///
    /// Note: callers should check [`Self::valid`] before calling this method
    pub fn build(self) -> Result<ColumnIndexMetaData> {
        Ok(match self.column_type {
            Type::BOOLEAN => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::BOOLEAN(index)
            }
            Type::INT32 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT32(index)
            }
            Type::INT64 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT64(index)
            }
            Type::INT96 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT96(index)
            }
            Type::FLOAT => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::FLOAT(index)
            }
            Type::DOUBLE => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::DOUBLE(index)
            }
            Type::BYTE_ARRAY => {
                let index = self.build_byte_array_index()?;
                ColumnIndexMetaData::BYTE_ARRAY(index)
            }
            Type::FIXED_LEN_BYTE_ARRAY => {
                let index = self.build_byte_array_index()?;
                ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index)
            }
        })
    }

    /// Builds a primitive-typed column index from the accumulated page data.
    ///
    /// `T` is the physical value type used to interpret the raw min/max bytes.
    fn build_page_index<T>(self) -> Result<PrimitiveColumnIndex<T>>
    where
        T: ParquetValueType,
    {
        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();

        PrimitiveColumnIndex::try_new(
            self.null_pages,
            self.boundary_order,
            Some(self.null_counts),
            self.repetition_level_histograms,
            self.definition_level_histograms,
            min_values,
            max_values,
        )
    }

    /// Builds a column index for BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY columns,
    /// keeping the min/max values as raw bytes.
    fn build_byte_array_index(self) -> Result<ByteArrayColumnIndex> {
        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();

        ByteArrayColumnIndex::try_new(
            self.null_pages,
            self.boundary_order,
            Some(self.null_counts),
            self.repetition_level_histograms,
            self.definition_level_histograms,
            min_values,
            max_values,
        )
    }
}
1521
impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
    /// Creates a builder pre-populated with an existing chunk's metadata,
    /// allowing selective modification (see [`ColumnChunkMetaData::into_builder`]).
    fn from(value: ColumnChunkMetaData) -> Self {
        ColumnChunkMetaDataBuilder(value)
    }
}
1527
/// Builder for offset index, part of the Parquet [PageIndex].
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
    /// Byte offset of each page, in page order.
    offset_array: Vec<i64>,
    /// Compressed size in bytes of each page.
    compressed_page_size_array: Vec<i32>,
    /// Index of the first row of each page.
    first_row_index_array: Vec<i64>,
    /// Per-page unencoded byte array sizes; `None` until a value is appended.
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running row count used to derive the next page's first row index.
    current_first_row_index: i64,
}
1538
impl Default for OffsetIndexBuilder {
    /// Equivalent to [`OffsetIndexBuilder::new`].
    fn default() -> Self {
        Self::new()
    }
}
1544
1545impl OffsetIndexBuilder {
1546    /// Creates a new offset index builder.
1547    pub fn new() -> Self {
1548        OffsetIndexBuilder {
1549            offset_array: Vec::new(),
1550            compressed_page_size_array: Vec::new(),
1551            first_row_index_array: Vec::new(),
1552            unencoded_byte_array_data_bytes_array: None,
1553            current_first_row_index: 0,
1554        }
1555    }
1556
1557    /// Append the row count of the next page.
1558    pub fn append_row_count(&mut self, row_count: i64) {
1559        let current_page_row_index = self.current_first_row_index;
1560        self.first_row_index_array.push(current_page_row_index);
1561        self.current_first_row_index += row_count;
1562    }
1563
1564    /// Append the offset and size of the next page.
1565    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1566        self.offset_array.push(offset);
1567        self.compressed_page_size_array.push(compressed_page_size);
1568    }
1569
1570    /// Append the unencoded byte array data bytes of the next page.
1571    pub fn append_unencoded_byte_array_data_bytes(
1572        &mut self,
1573        unencoded_byte_array_data_bytes: Option<i64>,
1574    ) {
1575        if let Some(val) = unencoded_byte_array_data_bytes {
1576            self.unencoded_byte_array_data_bytes_array
1577                .get_or_insert(Vec::new())
1578                .push(val);
1579        }
1580    }
1581
1582    /// Build and get the thrift metadata of offset index
1583    pub fn build(self) -> OffsetIndexMetaData {
1584        let locations = self
1585            .offset_array
1586            .iter()
1587            .zip(self.compressed_page_size_array.iter())
1588            .zip(self.first_row_index_array.iter())
1589            .map(|((offset, size), row_index)| PageLocation {
1590                offset: *offset,
1591                compressed_page_size: *size,
1592                first_row_index: *row_index,
1593            })
1594            .collect::<Vec<_>>();
1595        OffsetIndexMetaData {
1596            page_locations: locations,
1597            unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes_array,
1598        }
1599    }
1600}
1601
1602#[cfg(test)]
1603mod tests {
1604    use super::*;
1605    use crate::basic::{PageType, SortOrder};
1606    use crate::file::metadata::thrift::tests::{read_column_chunk, read_row_group};
1607
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        // Build one (default) column chunk per leaf column in the schema.
        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        // Serialize to thrift compact protocol bytes ...
        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta.write_thrift(&mut writer).unwrap();

        // ... then read back and verify a lossless round trip.
        let row_group_res = read_row_group(&mut buf, schema_descr).unwrap();

        assert_eq!(row_group_res, row_group_meta);
    }
1633
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        // Building with no column metadata must fail: the test schema has 2
        // leaf columns but the builder was given none.
        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }
1648
    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        // Schema with 2 leaf columns — used to write the row group.
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // Schema with 3 leaf columns — used (incorrectly) when reading back.
        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();
        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta_2cols.write_thrift(&mut writer).unwrap();

        // Decoding against the wider schema must fail with a clear error.
        let err = read_row_group(&mut buf, schema_descr_3cols)
            .unwrap_err()
            .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }
1719
1720    #[test]
1721    fn test_column_chunk_metadata_thrift_conversion() {
1722        let column_descr = get_test_schema_descr().column(0);
1723        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1724            .set_encodings_mask(EncodingMask::new_from_encodings(
1725                [Encoding::PLAIN, Encoding::RLE].iter(),
1726            ))
1727            .set_file_path("file_path".to_owned())
1728            .set_num_values(1000)
1729            .set_compression(Compression::SNAPPY)
1730            .set_total_compressed_size(2000)
1731            .set_total_uncompressed_size(3000)
1732            .set_data_page_offset(4000)
1733            .set_dictionary_page_offset(Some(5000))
1734            .set_page_encoding_stats(vec![
1735                PageEncodingStats {
1736                    page_type: PageType::DATA_PAGE,
1737                    encoding: Encoding::PLAIN,
1738                    count: 3,
1739                },
1740                PageEncodingStats {
1741                    page_type: PageType::DATA_PAGE,
1742                    encoding: Encoding::RLE,
1743                    count: 5,
1744                },
1745            ])
1746            .set_bloom_filter_offset(Some(6000))
1747            .set_bloom_filter_length(Some(25))
1748            .set_offset_index_offset(Some(7000))
1749            .set_offset_index_length(Some(25))
1750            .set_column_index_offset(Some(8000))
1751            .set_column_index_length(Some(25))
1752            .set_unencoded_byte_array_data_bytes(Some(2000))
1753            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
1754            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
1755            .build()
1756            .unwrap();
1757
1758        let mut buf = Vec::new();
1759        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1760        col_metadata.write_thrift(&mut writer).unwrap();
1761        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();
1762
1763        assert_eq!(col_chunk_res, col_metadata);
1764    }
1765
1766    #[test]
1767    fn test_column_chunk_metadata_thrift_conversion_empty() {
1768        let column_descr = get_test_schema_descr().column(0);
1769
1770        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1771            .build()
1772            .unwrap();
1773
1774        let mut buf = Vec::new();
1775        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1776        col_metadata.write_thrift(&mut writer).unwrap();
1777        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();
1778
1779        assert_eq!(col_chunk_res, col_metadata);
1780    }
1781
1782    #[test]
1783    fn test_compressed_size() {
1784        let schema_descr = get_test_schema_descr();
1785
1786        let mut columns = vec![];
1787        for column_descr in schema_descr.columns() {
1788            let column = ColumnChunkMetaData::builder(column_descr.clone())
1789                .set_total_compressed_size(500)
1790                .set_total_uncompressed_size(700)
1791                .build()
1792                .unwrap();
1793            columns.push(column);
1794        }
1795        let row_group_meta = RowGroupMetaData::builder(schema_descr)
1796            .set_num_rows(1000)
1797            .set_column_metadata(columns)
1798            .build()
1799            .unwrap();
1800
1801        let compressed_size_res: i64 = row_group_meta.compressed_size();
1802        let compressed_size_exp: i64 = 1000;
1803
1804        assert_eq!(compressed_size_res, compressed_size_exp);
1805    }
1806
    #[test]
    fn test_memory_size() {
        // Checks `ParquetMetaData::memory_size` against hard-coded expected byte
        // counts, and that adding more metadata (exact statistics, a column
        // index, and an offset index) strictly increases the reported size.
        // NOTE(review): the expected constants are sensitive to struct layout and
        // heap allocation sizes, so they differ per feature flag and must be
        // updated whenever the metadata structures change.
        let schema_descr = get_test_schema_descr();

        // Column chunks with "empty" statistics (no min/max values populated).
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        // File-level metadata with representative values for every field.
        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Now, add in Exact Statistics
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        // Baseline: metadata with min/max statistics but no page indexes.
        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        // The encryption feature adds fields to the metadata structures, so the
        // expected size differs per feature flag.
        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2248;
        #[cfg(feature = "encryption")]
        // Not as accurate as it should be: https://github.com/apache/arrow-rs/issues/8472
        let base_expected_size = 2416;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Build a single-entry boolean column index to attach to the metadata.
        let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN);
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build().unwrap();
        let native_index = match column_index {
            ColumnIndexMetaData::BOOLEAN(index) => index,
            _ => panic!("wrong type of column index"),
        };

        // Now, add in OffsetIndex
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build();

        // Metadata with column and offset indexes attached (but empty statistics).
        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![offset_index]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 2674;
        #[cfg(feature = "encryption")]
        // Not as accurate as it should be: https://github.com/apache/arrow-rs/issues/8472
        let bigger_expected_size = 2842;

        // more set fields means more memory usage
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }
1919
1920    /// Returns sample schema descriptor so we can create column metadata.
1921    fn get_test_schema_descr() -> SchemaDescPtr {
1922        let schema = SchemaType::group_type_builder("schema")
1923            .with_fields(vec![
1924                Arc::new(
1925                    SchemaType::primitive_type_builder("a", Type::INT32)
1926                        .build()
1927                        .unwrap(),
1928                ),
1929                Arc::new(
1930                    SchemaType::primitive_type_builder("b", Type::INT32)
1931                        .build()
1932                        .unwrap(),
1933                ),
1934            ])
1935            .build()
1936            .unwrap();
1937
1938        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
1939    }
1940}