parquet/file/metadata/mod.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Parquet metadata API
//!
//! Users should use these structures to interact with Parquet metadata.
//!
//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
//!   file footer.
//!
//! * [`FileMetaData`]: File level metadata such as schema, row counts and
//!   version.
//!
//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
//!   location, number of rows, and column chunks.
//!
//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
//!   within a Row Group including encoding and compression information,
//!   number of values, statistics, etc.
//!
//! # APIs for working with Parquet Metadata
//!
//! The Parquet readers and writers in this crate handle reading and writing
//! metadata in Parquet files. To work with metadata directly,
//! the following APIs are available:
//!
//! * [`ParquetMetaDataReader`] for reading metadata from an I/O source (sync and async)
//! * [`ParquetMetaDataPushDecoder`] for decoding from bytes without I/O
//! * [`ParquetMetaDataWriter`] for writing metadata.
//!
//! # Examples
//!
//! Please see [`external_metadata.rs`]
//!
//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
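//!
//! For example, a minimal sketch of reading the footer metadata from a file
//! with [`ParquetMetaDataReader`] (the file name here is hypothetical):
//!
//! ```no_run
//! # use std::fs::File;
//! # use parquet::file::metadata::ParquetMetaDataReader;
//! let file = File::open("example.parquet").unwrap();
//! // read and parse the footer metadata
//! let metadata = ParquetMetaDataReader::new()
//!     .parse_and_finish(&file)
//!     .unwrap();
//! println!("Read metadata for {} row groups", metadata.num_row_groups());
//! ```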
//!
//! # Metadata Encodings and Structures
//!
//! There are three different encodings of Parquet Metadata in this crate:
//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
//!    [parquet.thrift]
//!
//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
//!    from [parquet.thrift]. These structures are low level and mirror
//!    the thrift definitions.
//!
//! 3. [`file::metadata`] (this module): Easier-to-use Rust structures
//!    with a more idiomatic API. Note that, confusingly, some but not all
//!    of these structures have the same name as the [`format`] structures.
//!
//! [`file::metadata`]: crate::file::metadata
//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
//!
//! Graphically, this is how the different structures relate to each other:
//!
//! ```text
//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
//!                            └──────────────┘     │         └───────────────────────┘ │
//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
//!                                     ...         │                   ...             │
//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
//!
//!                          format::meta structures          file::metadata structures
//!
//!                         * Same name, different struct
//! ```
mod footer_tail;
mod memory;
mod options;
mod parser;
mod push_decoder;
pub(crate) mod reader;
pub(crate) mod thrift;
mod writer;

use crate::basic::{EncodingMask, PageType};
#[cfg(feature = "encryption")]
use crate::encryption::decrypt::FileDecryptor;
#[cfg(feature = "encryption")]
use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
pub(crate) use crate::file::metadata::memory::HeapSize;
#[cfg(feature = "encryption")]
use crate::file::metadata::thrift::encryption::EncryptionAlgorithm;
use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex};
use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation};
use crate::file::statistics::Statistics;
use crate::geospatial::statistics as geo_statistics;
use crate::schema::types::{
    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
    Type as SchemaType,
};
use crate::thrift_struct;
use crate::{
    basic::BoundaryOrder,
    errors::{ParquetError, Result},
};
use crate::{
    basic::{ColumnOrder, Compression, Encoding, Type},
    parquet_thrift::{
        ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
        ThriftCompactOutputProtocol, WriteThrift, WriteThriftField,
    },
};
use crate::{
    data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
};

pub use footer_tail::FooterTail;
pub use options::{ParquetMetaDataOptions, ParquetStatisticsPolicy};
pub use push_decoder::ParquetMetaDataPushDecoder;
pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
use std::io::Write;
use std::ops::Range;
use std::sync::Arc;
pub use writer::ParquetMetaDataWriter;
pub(crate) use writer::ThriftMetadataWriter;

/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`ColumnIndex`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`ColumnIndex`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example `column_index[2][3]` holds the [`ColumnIndex`] for the fourth
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
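///
/// A sketch of looking up the index for one column chunk, assuming the page
/// index was loaded (the loading function is a placeholder):
///
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// let metadata = load_metadata();
/// if let Some(column_index) = metadata.column_index() {
///     // `ColumnIndex` for the fourth column in the third row group
///     let _index = &column_index[2][3];
/// }
/// ```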
pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;

/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number`.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;

/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
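///
/// A sketch of inspecting metadata once it has been loaded (the loading
/// function is a placeholder):
///
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// let metadata = load_metadata();
/// println!("File has {} rows", metadata.file_metadata().num_rows());
/// for row_group in metadata.row_groups() {
///     println!("Row group: {} rows in {} columns", row_group.num_rows(), row_group.num_columns());
/// }
/// ```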
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata
    file_metadata: FileMetaData,
    /// Row group metadata
    row_groups: Vec<RowGroupMetaData>,
    /// Page level index for each page in each column chunk
    column_index: Option<ParquetColumnIndex>,
    /// Offset index for each page in each column chunk
    offset_index: Option<ParquetOffsetIndex>,
    /// Optional file decryptor
    #[cfg(feature = "encryption")]
    file_decryptor: Option<Box<FileDecryptor>>,
}

impl ParquetMetaData {
    /// Creates Parquet metadata from file metadata and a list of row
    /// group metadata
    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
        ParquetMetaData {
            file_metadata,
            row_groups,
            column_index: None,
            offset_index: None,
            #[cfg(feature = "encryption")]
            file_decryptor: None,
        }
    }

    /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of
    /// encrypted data.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
        self.file_decryptor = file_decryptor.map(Box::new);
    }

    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
    pub fn into_builder(self) -> ParquetMetaDataBuilder {
        self.into()
    }

    /// Returns file metadata as reference.
    pub fn file_metadata(&self) -> &FileMetaData {
        &self.file_metadata
    }

    /// Returns file decryptor as reference.
    #[cfg(feature = "encryption")]
    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
        self.file_decryptor.as_deref()
    }

    /// Returns number of row groups in this file.
    pub fn num_row_groups(&self) -> usize {
        self.row_groups.len()
    }

    /// Returns row group metadata for the `i`th position.
    /// Position should be less than the number of row groups (`num_row_groups`).
    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
        &self.row_groups[i]
    }

    /// Returns slice of row groups in this file.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.row_groups
    }

    /// Returns the column index for this file if loaded
    ///
    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
    /// [ArrowReaderOptions::with_page_index] was set to false.
    ///
    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.column_index.as_ref()
    }

    /// Returns offset indexes in this file, if loaded
    ///
    /// Returns `None` if the parquet file does not have an `OffsetIndex` or
    /// [ArrowReaderOptions::with_page_index] was set to false.
    ///
    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.offset_index.as_ref()
    }

    /// Estimate of the bytes allocated to store `ParquetMetaData`
    ///
    /// # Notes:
    ///
    /// 1. Includes size of self
    ///
    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
    ///    [`RowGroupMetaData`].
    ///
    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
    ///    means `memory_size` will overestimate the memory size if such pointers
    ///    are shared.
    ///
    /// 4. Does not include any allocator overheads
    pub fn memory_size(&self) -> usize {
        #[cfg(feature = "encryption")]
        let encryption_size = self.file_decryptor.heap_size();
        #[cfg(not(feature = "encryption"))]
        let encryption_size = 0usize;

        std::mem::size_of::<Self>()
            + self.file_metadata.heap_size()
            + self.row_groups.heap_size()
            + self.column_index.heap_size()
            + self.offset_index.heap_size()
            + encryption_size
    }

    /// Override the column index
    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
        self.column_index = index;
    }

    /// Override the offset index
    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
        self.offset_index = index;
    }
}

/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// # Example creating a new [`ParquetMetaData`]
///
/// ```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
pub struct ParquetMetaDataBuilder(ParquetMetaData);

impl ParquetMetaDataBuilder {
    /// Create a new builder from a file metadata, with no row groups
    pub fn new(file_meta_data: FileMetaData) -> Self {
        Self(ParquetMetaData::new(file_meta_data, vec![]))
    }

    /// Create a new builder from an existing ParquetMetaData
    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
        Self(metadata)
    }

    /// Adds a row group to the metadata
    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
        self.0.row_groups.push(row_group);
        self
    }

    /// Sets all the row groups to the specified list
    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
        self.0.row_groups = row_groups;
        self
    }

    /// Takes ownership of the row groups in this builder, and clears the list
    /// of row groups.
    ///
    /// This can be used for more efficient creation of a new ParquetMetaData
    /// from an existing one.
    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
        std::mem::take(&mut self.0.row_groups)
    }

    /// Return a reference to the current row groups
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.0.row_groups
    }

    /// Sets the column index
    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
        self.0.column_index = column_index;
        self
    }

    /// Returns the current column index from the builder, replacing it with `None`
    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
        std::mem::take(&mut self.0.column_index)
    }

    /// Return a reference to the current column index, if any
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.0.column_index.as_ref()
    }

    /// Sets the offset index
    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
        self.0.offset_index = offset_index;
        self
    }

    /// Returns the current offset index from the builder, replacing it with `None`
    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
        std::mem::take(&mut self.0.offset_index)
    }

    /// Return a reference to the current offset index, if any
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.0.offset_index.as_ref()
    }

    /// Sets the file decryptor needed to decrypt this metadata.
    #[cfg(feature = "encryption")]
    pub(crate) fn set_file_decryptor(mut self, file_decryptor: Option<FileDecryptor>) -> Self {
        self.0.with_file_decryptor(file_decryptor);
        self
    }

    /// Creates a new ParquetMetaData from the builder
    pub fn build(self) -> ParquetMetaData {
        let Self(metadata) = self;
        metadata
    }
}

impl From<ParquetMetaData> for ParquetMetaDataBuilder {
    fn from(meta_data: ParquetMetaData) -> Self {
        Self(meta_data)
    }
}

thrift_struct!(
/// A key-value pair for [`FileMetaData`].
pub struct KeyValue {
  1: required string key
  2: optional string value
}
);

impl KeyValue {
    /// Create a new key value pair
    pub fn new<F2>(key: String, value: F2) -> KeyValue
    where
        F2: Into<Option<String>>,
    {
        KeyValue {
            key,
            value: value.into(),
        }
    }
}

thrift_struct!(
/// PageEncodingStats for a column chunk and data page.
pub struct PageEncodingStats {
  1: required PageType page_type;
  2: required Encoding encoding;
  3: required i32 count;
}
);

/// Internal representation of the page encoding stats in the [`ColumnChunkMetaData`].
/// This is not publicly exposed, with different getters defined for each variant.
#[derive(Debug, Clone, PartialEq)]
enum ParquetPageEncodingStats {
    /// The full array of stats as defined in the Parquet spec.
    Full(Vec<PageEncodingStats>),
    /// A condensed version of only the page encodings seen.
    Mask(EncodingMask),
}

/// Reference counted pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;

/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    version: i32,
    num_rows: i64,
    created_by: Option<String>,
    key_value_metadata: Option<Vec<KeyValue>>,
    schema_descr: SchemaDescPtr,
    column_orders: Option<Vec<ColumnOrder>>,
    #[cfg(feature = "encryption")]
    encryption_algorithm: Option<Box<EncryptionAlgorithm>>,
    #[cfg(feature = "encryption")]
    footer_signing_key_metadata: Option<Vec<u8>>,
}

impl FileMetaData {
    /// Creates new file metadata.
    pub fn new(
        version: i32,
        num_rows: i64,
        created_by: Option<String>,
        key_value_metadata: Option<Vec<KeyValue>>,
        schema_descr: SchemaDescPtr,
        column_orders: Option<Vec<ColumnOrder>>,
    ) -> Self {
        FileMetaData {
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr,
            column_orders,
            #[cfg(feature = "encryption")]
            encryption_algorithm: None,
            #[cfg(feature = "encryption")]
            footer_signing_key_metadata: None,
        }
    }

    #[cfg(feature = "encryption")]
    pub(crate) fn with_encryption_algorithm(
        mut self,
        encryption_algorithm: Option<EncryptionAlgorithm>,
    ) -> Self {
        self.encryption_algorithm = encryption_algorithm.map(Box::new);
        self
    }

    #[cfg(feature = "encryption")]
    pub(crate) fn with_footer_signing_key_metadata(
        mut self,
        footer_signing_key_metadata: Option<Vec<u8>>,
    ) -> Self {
        self.footer_signing_key_metadata = footer_signing_key_metadata;
        self
    }

    /// Returns version of this file.
    pub fn version(&self) -> i32 {
        self.version
    }

    /// Returns number of rows in the file.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// String message for the application that wrote this file.
    ///
    /// This should have the following format:
    /// `<application> version <application version> (build <application build hash>)`.
    ///
    /// ```shell
    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
    /// ```
    pub fn created_by(&self) -> Option<&str> {
        self.created_by.as_deref()
    }

    /// Returns key_value_metadata of this file.
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns Parquet [`Type`] that describes schema in this file.
    ///
    /// [`Type`]: crate::schema::types::Type
    pub fn schema(&self) -> &SchemaType {
        self.schema_descr.root_schema()
    }

    /// Returns a reference to schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        &self.schema_descr
    }

    /// Returns reference counted clone for schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Column (sort) order used for `min` and `max` values of each column in this file.
    ///
    /// Each column order corresponds to one column, determined by its position in the
    /// list, matching the position of the column in the schema.
    ///
    /// When `None` is returned, there are no column orders available, and each column
    /// should be assumed to have undefined (legacy) column order.
    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
        self.column_orders.as_ref()
    }

    /// Returns column order for the `i`th column in this file.
    /// If column orders are not available, returns undefined (legacy) column order.
    pub fn column_order(&self, i: usize) -> ColumnOrder {
        self.column_orders
            .as_ref()
            .map(|data| data[i])
            .unwrap_or(ColumnOrder::UNDEFINED)
    }
}

thrift_struct!(
/// Sort order within a RowGroup of a leaf column
pub struct SortingColumn {
  /// The ordinal position of the column (in this row group)
  1: required i32 column_idx

  /// If true, indicates this column is sorted in descending order.
  2: required bool descending

  /// If true, nulls will come before non-null values, otherwise,
  /// nulls go at the end.
  3: required bool nulls_first
}
);

/// Reference counted pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows,
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    columns: Vec<ColumnChunkMetaData>,
    num_rows: i64,
    sorting_columns: Option<Vec<SortingColumn>>,
    total_byte_size: i64,
    schema_descr: SchemaDescPtr,
    /// We can't infer this from the file offset of the first column since there
    /// may be empty columns in the row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in the file
    ordinal: Option<i16>,
}

impl RowGroupMetaData {
    /// Returns builder for row group metadata.
    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder::new(schema_descr)
    }

    /// Number of columns in this row group.
    pub fn num_columns(&self) -> usize {
        self.columns.len()
    }

    /// Returns column chunk metadata for the `i`th column.
    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
        &self.columns[i]
    }

    /// Returns slice of column chunk metadata.
    pub fn columns(&self) -> &[ColumnChunkMetaData] {
        &self.columns
    }

    /// Returns mutable slice of column chunk metadata.
    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
        &mut self.columns
    }

    /// Number of rows in this row group.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// Returns the sort ordering of the rows in this RowGroup if any
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Total byte size of all uncompressed column data in this row group.
    pub fn total_byte_size(&self) -> i64 {
        self.total_byte_size
    }

    /// Total size of all compressed column data in this row group.
    pub fn compressed_size(&self) -> i64 {
        self.columns.iter().map(|c| c.total_compressed_size).sum()
    }

    /// Returns reference to a schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        self.schema_descr.as_ref()
    }

    /// Returns reference counted clone of schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Returns ordinal position of this row group in the file.
    ///
    /// For example if this is the first row group in the file, this will return 0.
    /// If this is the second row group in the file, this will return 1.
    #[inline(always)]
    pub fn ordinal(&self) -> Option<i16> {
        self.ordinal
    }

    /// Returns file offset of this row group in the file.
    #[inline(always)]
    pub fn file_offset(&self) -> Option<i64> {
        self.file_offset
    }

    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder(self)
    }
}

/// Builder for row group metadata.
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);

impl RowGroupMetaDataBuilder {
    /// Creates new builder from schema descriptor.
    fn new(schema_descr: SchemaDescPtr) -> Self {
        Self(RowGroupMetaData {
            columns: Vec::with_capacity(schema_descr.num_columns()),
            schema_descr,
            file_offset: None,
            num_rows: 0,
            sorting_columns: None,
            total_byte_size: 0,
            ordinal: None,
        })
    }

    /// Sets number of rows in this row group.
    pub fn set_num_rows(mut self, value: i64) -> Self {
        self.0.num_rows = value;
        self
    }

    /// Sets the sorting order for columns
    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
        self.0.sorting_columns = value;
        self
    }

    /// Sets total size in bytes for this row group.
    pub fn set_total_byte_size(mut self, value: i64) -> Self {
        self.0.total_byte_size = value;
        self
    }

    /// Takes ownership of the column metadata in this builder, and clears
    /// the list of columns.
    ///
    /// This can be used for more efficient creation of a new RowGroupMetaData
    /// from an existing one.
    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
        std::mem::take(&mut self.0.columns)
    }

    /// Sets column metadata for this row group.
    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
        self.0.columns = value;
        self
    }

    /// Adds a column metadata to this row group
    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
        self.0.columns.push(value);
        self
    }

    /// Sets ordinal for this row group.
    pub fn set_ordinal(mut self, value: i16) -> Self {
        self.0.ordinal = Some(value);
        self
    }

    /// Sets file offset for this row group.
    pub fn set_file_offset(mut self, value: i64) -> Self {
        self.0.file_offset = Some(value);
        self
    }

    /// Builds row group metadata.
    pub fn build(self) -> Result<RowGroupMetaData> {
        if self.0.schema_descr.num_columns() != self.0.columns.len() {
            return Err(general_err!(
                "Column length mismatch: {} != {}",
                self.0.schema_descr.num_columns(),
                self.0.columns.len()
            ));
        }

        Ok(self.0)
    }

    /// Build row group metadata without validation.
    pub(super) fn build_unchecked(self) -> RowGroupMetaData {
        self.0
    }
}

/// Metadata for a column chunk.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    column_descr: ColumnDescPtr,
    encodings: EncodingMask,
    file_path: Option<String>,
    file_offset: i64,
    num_values: i64,
    compression: Compression,
    total_compressed_size: i64,
    total_uncompressed_size: i64,
    data_page_offset: i64,
    index_page_offset: Option<i64>,
    dictionary_page_offset: Option<i64>,
    statistics: Option<Statistics>,
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    encoding_stats: Option<ParquetPageEncodingStats>,
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    unencoded_byte_array_data_bytes: Option<i64>,
    repetition_level_histogram: Option<LevelHistogram>,
    definition_level_histogram: Option<LevelHistogram>,
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
    #[cfg(feature = "encryption")]
    encrypted_column_metadata: Option<Vec<u8>>,
}

/// Histograms for repetition and definition levels.
///
/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of
/// values at level `i`.
///
/// For example, `vec[0]` is the number of values with level 0, `vec[1]` is the
/// number of values with level 1, and so on.
///
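/// A short example of accumulating a histogram from definition levels
/// (illustrative values only):
///
/// ```
/// # use parquet::file::metadata::LevelHistogram;
/// // max definition level of 1 => histogram with 2 buckets (levels 0 and 1)
/// let mut hist = LevelHistogram::try_new(1).unwrap();
/// hist.update_from_levels(&[0, 1, 1, 1]);
/// assert_eq!(hist.get(0), Some(1)); // one value at level 0
/// assert_eq!(hist.get(1), Some(3)); // three values at level 1
/// ```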
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a new level histogram.
    ///
    /// Length will be `max_level + 1`.
    ///
    /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case)
    pub fn try_new(max_level: i16) -> Option<Self> {
        if max_level > 0 {
            Some(Self {
                inner: vec![0; max_level as usize + 1],
            })
        } else {
            None
        }
    }
    /// Returns a reference to the histogram's values.
    pub fn values(&self) -> &[i64] {
        &self.inner
    }

    /// Return the inner vector, consuming self
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the histogram value at the given index.
    ///
    /// The value of `i` is the number of values with level `i`. For example,
    /// `get(1)` returns the number of values with level 1.
    ///
    /// Returns `None` if the index is out of bounds.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Adds the values from the other histogram to this histogram
    ///
    /// # Panics
    /// If the histograms have different lengths
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
            *dst += src;
        }
    }

    /// Returns the length of the histogram
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Returns true if the histogram is empty
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Sets the values of all histogram levels to 0.
    pub fn reset(&mut self) {
        for value in self.inner.iter_mut() {
            *value = 0;
        }
    }

    /// Updates histogram values using provided repetition levels
    ///
    /// # Panics
    /// if any of the levels is greater than `max_level` (the value
    /// supplied to [`Self::try_new`])
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &level in levels {
            self.inner[level as usize] += 1;
        }
    }
}

impl From<Vec<i64>> for LevelHistogram {
    fn from(inner: Vec<i64>) -> Self {
        Self { inner }
    }
}

impl From<LevelHistogram> for Vec<i64> {
    fn from(value: LevelHistogram) -> Self {
        value.into_inner()
    }
}

impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        self.inner.heap_size()
    }
}

/// Represents common operations for a column chunk.
impl ColumnChunkMetaData {
    /// Returns builder for column chunk metadata.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// File where the column chunk is stored.
    ///
    /// If not set, assumed to belong to the same file as the metadata.
    /// This path is relative to the current file.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Byte offset of `ColumnMetaData` in `file_path()`.
    ///
    /// Note that the meaning of this field has been inconsistent between implementations
    /// so its use has since been deprecated in the Parquet specification. Modern implementations
    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
    /// `ColumnChunk` struct.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Type of this column. Must be primitive.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Path (or identifier) of this column.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Descriptor for this column.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Reference counted clone of descriptor for this column.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// All encodings used for this column.
    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
        self.encodings.encodings()
    }

    /// All encodings used for this column, returned as a bitmask.
    pub fn encodings_mask(&self) -> &EncodingMask {
        &self.encodings
    }

    /// Total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Compression for this column.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed data size of this column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed data size of this column chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset for the column data.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset for the index page.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset for the dictionary page, if any.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the offset and length in bytes of the column chunk within the file
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns statistics that are set for this column chunk,
    /// or `None` if no statistics are available.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns geospatial statistics that are set for this column chunk,
    /// or `None` if no geospatial statistics are available.
    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
        self.geo_statistics.as_deref()
    }

    /// Returns the page encoding statistics, or `None` if no page encoding statistics
    /// are available (or they were converted to a mask).
    ///
    /// Note: By default, this crate converts page encoding statistics to a mask for performance
    /// reasons. To get the full statistics, you must set [`ParquetMetaDataOptions::with_encoding_stats_as_mask`]
    /// to `false`.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        match self.encoding_stats.as_ref() {
            Some(ParquetPageEncodingStats::Full(stats)) => Some(stats),
            _ => None,
        }
    }

    /// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are
    /// not available (or they were left in their original form).
    ///
    /// Note: This is the default behavior for this crate.
    ///
    /// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to
    /// enable fast determination of whether all pages in a column chunk are dictionary encoded
    /// (see <https://github.com/apache/parquet-format/pull/16>).
    /// Decoding the full page encoding statistics, however, can be very costly, and is not
    /// necessary to support the aforementioned use case. As an alternative, this crate can
    /// instead distill the list of `PageEncodingStats` down to a bitmask of just the encodings
    /// used for data pages
    /// (see [`ParquetMetaDataOptions::set_encoding_stats_as_mask`]).
    /// To test for an all-dictionary-encoded chunk one could use this bitmask in the following way:
    ///
    /// ```rust
    /// use parquet::basic::Encoding;
    /// use parquet::file::metadata::ColumnChunkMetaData;
    /// // test if all data pages in the column chunk are dictionary encoded
    /// fn is_all_dictionary_encoded(col_meta: &ColumnChunkMetaData) -> bool {
    ///     // check that dictionary encoding was used
    ///     col_meta.dictionary_page_offset().is_some()
    ///         && col_meta.page_encoding_stats_mask().is_some_and(|mask| {
    ///             // mask should only have one bit set, either for PLAIN_DICTIONARY or
    ///             // RLE_DICTIONARY
    ///             mask.is_only(Encoding::PLAIN_DICTIONARY) || mask.is_only(Encoding::RLE_DICTIONARY)
    ///         })
    /// }
    /// ```
    pub fn page_encoding_stats_mask(&self) -> Option<&EncodingMask> {
        match self.encoding_stats.as_ref() {
            Some(ParquetPageEncodingStats::Mask(stats)) => Some(stats),
            _ => None,
        }
    }

    /// Returns the offset for the bloom filter.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length of the bloom filter.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset for the column index.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length of the column index.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range for the column index, if any
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset for the offset index.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length of the offset index.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range for the offset index, if any
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of bytes of variable length data after decoding.
    ///
    /// Only set for BYTE_ARRAY columns. This field may not be set by older
    /// writers.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
    /// `vec[0]` indicates how many rows the page contains.
    /// This field may not be set by older writers.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
    /// This field may not be set by older writers.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the encryption metadata for this column chunk.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_deref()
    }

    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }
}

/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);

impl ColumnChunkMetaDataBuilder {
    /// Creates new column chunk metadata builder.
    ///
    /// See also [`ColumnChunkMetaData::builder`]
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Default::default(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            geo_statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
            #[cfg(feature = "encryption")]
            encrypted_column_metadata: None,
        })
    }

    /// Sets list of encodings for this column chunk.
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = EncodingMask::new_from_encodings(encodings.iter());
        self
    }

    /// Sets the encodings mask for this column chunk.
    pub fn set_encodings_mask(mut self, encodings: EncodingMask) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets optional file path for this column chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets number of values.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets compression.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets data page offset in bytes.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets optional dictionary page offset in bytes.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets optional index page offset in bytes.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets statistics for this column chunk.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Sets geospatial statistics for this column chunk.
    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
        self.0.geo_statistics = Some(value);
        self
    }

    /// Clears the statistics for this column chunk.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets page encoding stats for this column chunk.
    ///
    /// This will overwrite any existing stats, either `Vec` based or bitmask.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(ParquetPageEncodingStats::Full(value));
        self
    }

    /// Sets page encoding stats mask for this column chunk.
    ///
    /// This will overwrite any existing stats, either `Vec` based or bitmask.
    pub fn set_page_encoding_stats_mask(mut self, value: EncodingMask) -> Self {
        self.0.encoding_stats = Some(ParquetPageEncodingStats::Mask(value));
        self
    }

    /// Clears the page encoding stats for this column chunk.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets optional bloom filter offset in bytes.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets optional bloom filter length in bytes.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets optional offset index offset in bytes.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets optional offset index length in bytes.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets optional column index offset in bytes.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets optional column index length in bytes.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets optional length of variable length data in bytes.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets optional repetition level histogram
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets optional definition level histogram
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the encryption metadata for an encrypted column
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value.map(Box::new);
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the serialized encrypted column metadata for an encrypted column
    pub fn set_encrypted_column_metadata(mut self, value: Option<Vec<u8>>) -> Self {
        self.0.encrypted_column_metadata = value;
        self
    }

    /// Builds column chunk metadata.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}

/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
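///
/// A sketch of building a column index for an INT32 column with two pages
/// (the values are purely illustrative):
///
/// ```no_run
/// # use parquet::basic::Type;
/// # use parquet::file::metadata::ColumnIndexBuilder;
/// let mut builder = ColumnIndexBuilder::new(Type::INT32);
/// // append per-page statistics: (null_page, min, max, null_count)
/// builder.append(false, 1i32.to_le_bytes().to_vec(), 10i32.to_le_bytes().to_vec(), 0);
/// builder.append(false, 11i32.to_le_bytes().to_vec(), 20i32.to_le_bytes().to_vec(), 2);
/// let _column_index = builder.build().unwrap();
/// ```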
pub struct ColumnIndexBuilder {
    column_type: Type,
    null_pages: Vec<bool>,
    min_values: Vec<Vec<u8>>,
    max_values: Vec<Vec<u8>>,
    null_counts: Vec<i64>,
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-NaN values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}

impl ColumnIndexBuilder {
    /// Creates a new column index builder.
    pub fn new(column_type: Type) -> Self {
        ColumnIndexBuilder {
            column_type,
            null_pages: Vec::new(),
            min_values: Vec::new(),
            max_values: Vec::new(),
            null_counts: Vec::new(),
            boundary_order: BoundaryOrder::UNORDERED,
            repetition_level_histograms: None,
            definition_level_histograms: None,
            valid: true,
        }
    }

    /// Append statistics for the next page
    pub fn append(
        &mut self,
        null_page: bool,
        min_value: Vec<u8>,
        max_value: Vec<u8>,
        null_count: i64,
    ) {
        self.null_pages.push(null_page);
        self.min_values.push(min_value);
        self.max_values.push(max_value);
        self.null_counts.push(null_count);
    }

    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
    ///
    /// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
    pub fn append_histograms(
        &mut self,
        repetition_level_histogram: &Option<LevelHistogram>,
        definition_level_histogram: &Option<LevelHistogram>,
    ) {
        if !self.valid {
            return;
        }
        if let Some(rep_lvl_hist) = repetition_level_histogram {
            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
            hist.reserve(rep_lvl_hist.len());
            hist.extend(rep_lvl_hist.values());
        }
        if let Some(def_lvl_hist) = definition_level_histogram {
            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
            hist.reserve(def_lvl_hist.len());
            hist.extend(def_lvl_hist.values());
        }
    }

    /// Set the boundary order of the column index
    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
        self.boundary_order = boundary_order;
    }

    /// Mark this column index as invalid
    pub fn to_invalid(&mut self) {
        self.valid = false;
    }

    /// Is the information in the builder valid?
    pub fn valid(&self) -> bool {
        self.valid
    }

    /// Build and get the column index
    ///
    /// Note: callers should check [`Self::valid`] before calling this method
    pub fn build(self) -> Result<ColumnIndexMetaData> {
        Ok(match self.column_type {
            Type::BOOLEAN => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::BOOLEAN(index)
            }
            Type::INT32 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT32(index)
            }
            Type::INT64 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT64(index)
            }
            Type::INT96 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT96(index)
            }
            Type::FLOAT => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::FLOAT(index)
            }
            Type::DOUBLE => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::DOUBLE(index)
            }
            Type::BYTE_ARRAY => {
                let index = self.build_byte_array_index()?;
                ColumnIndexMetaData::BYTE_ARRAY(index)
            }
            Type::FIXED_LEN_BYTE_ARRAY => {
                let index = self.build_byte_array_index()?;
                ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index)
            }
        })
    }

    fn build_page_index<T>(self) -> Result<PrimitiveColumnIndex<T>>
    where
        T: ParquetValueType,
    {
        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();

        PrimitiveColumnIndex::try_new(
            self.null_pages,
            self.boundary_order,
            Some(self.null_counts),
            self.repetition_level_histograms,
            self.definition_level_histograms,
            min_values,
            max_values,
        )
    }

    fn build_byte_array_index(self) -> Result<ByteArrayColumnIndex> {
        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();
1579
1580        ByteArrayColumnIndex::try_new(
1581            self.null_pages,
1582            self.boundary_order,
1583            Some(self.null_counts),
1584            self.repetition_level_histograms,
1585            self.definition_level_histograms,
1586            min_values,
1587            max_values,
1588        )
1589    }
1590}
1591
1592impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1593    fn from(value: ColumnChunkMetaData) -> Self {
1594        ColumnChunkMetaDataBuilder(value)
1595    }
1596}
1597
1598/// Builder for the offset index, part of the Parquet [PageIndex].
1599///
1600/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
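///
/// # Example
///
/// A minimal sketch, assuming the builder is used via the public
/// `parquet::file::metadata` path:
///
/// ```
/// # use parquet::file::metadata::OffsetIndexBuilder;
/// let mut builder = OffsetIndexBuilder::new();
/// // For each page: the number of rows it holds, then its file offset and
/// // compressed size in bytes
/// builder.append_row_count(100);
/// builder.append_offset_and_size(4, 1024);
/// let offset_index = builder.build();
/// assert_eq!(offset_index.page_locations().len(), 1);
/// ```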
1601pub struct OffsetIndexBuilder {
1602    offset_array: Vec<i64>,
1603    compressed_page_size_array: Vec<i32>,
1604    first_row_index_array: Vec<i64>,
1605    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
1606    current_first_row_index: i64,
1607}
1608
1609impl Default for OffsetIndexBuilder {
1610    fn default() -> Self {
1611        Self::new()
1612    }
1613}
1614
1615impl OffsetIndexBuilder {
1616    /// Creates a new offset index builder.
1617    pub fn new() -> Self {
1618        OffsetIndexBuilder {
1619            offset_array: Vec::new(),
1620            compressed_page_size_array: Vec::new(),
1621            first_row_index_array: Vec::new(),
1622            unencoded_byte_array_data_bytes_array: None,
1623            current_first_row_index: 0,
1624        }
1625    }
1626
1627    /// Append the row count of the next page.
1628    pub fn append_row_count(&mut self, row_count: i64) {
1629        let current_page_row_index = self.current_first_row_index;
1630        self.first_row_index_array.push(current_page_row_index);
1631        self.current_first_row_index += row_count;
1632    }
1633
1634    /// Append the offset and size of the next page.
1635    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1636        self.offset_array.push(offset);
1637        self.compressed_page_size_array.push(compressed_page_size);
1638    }
1639
1640    /// Append the unencoded byte array data bytes of the next page.
1641    pub fn append_unencoded_byte_array_data_bytes(
1642        &mut self,
1643        unencoded_byte_array_data_bytes: Option<i64>,
1644    ) {
1645        if let Some(val) = unencoded_byte_array_data_bytes {
1646            self.unencoded_byte_array_data_bytes_array
1647                .get_or_insert(Vec::new())
1648                .push(val);
1649        }
1650    }
1651
1652    /// Build the resulting [`OffsetIndexMetaData`]
1653    pub fn build(self) -> OffsetIndexMetaData {
1654        let locations = self
1655            .offset_array
1656            .iter()
1657            .zip(self.compressed_page_size_array.iter())
1658            .zip(self.first_row_index_array.iter())
1659            .map(|((offset, size), row_index)| PageLocation {
1660                offset: *offset,
1661                compressed_page_size: *size,
1662                first_row_index: *row_index,
1663            })
1664            .collect::<Vec<_>>();
1665        OffsetIndexMetaData {
1666            page_locations: locations,
1667            unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes_array,
1668        }
1669    }
1670}
1671
1672#[cfg(test)]
1673mod tests {
1674    use super::*;
1675    use crate::basic::{PageType, SortOrder};
1676    use crate::file::metadata::thrift::tests::{
1677        read_column_chunk, read_column_chunk_with_options, read_row_group,
1678    };
1679
1680    #[test]
1681    fn test_row_group_metadata_thrift_conversion() {
1682        let schema_descr = get_test_schema_descr();
1683
1684        let mut columns = vec![];
1685        for ptr in schema_descr.columns() {
1686            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
1687            columns.push(column);
1688        }
1689        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
1690            .set_num_rows(1000)
1691            .set_total_byte_size(2000)
1692            .set_column_metadata(columns)
1693            .set_ordinal(1)
1694            .build()
1695            .unwrap();
1696
1697        let mut buf = Vec::new();
1698        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1699        row_group_meta.write_thrift(&mut writer).unwrap();
1700
1701        let row_group_res = read_row_group(&mut buf, schema_descr).unwrap();
1702
1703        assert_eq!(row_group_res, row_group_meta);
1704    }
1705
1706    #[test]
1707    fn test_row_group_metadata_thrift_conversion_empty() {
1708        let schema_descr = get_test_schema_descr();
1709
1710        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();
1711
1712        assert!(row_group_meta.is_err());
1713        if let Err(e) = row_group_meta {
1714            assert_eq!(
1715                format!("{e}"),
1716                "Parquet error: Column length mismatch: 2 != 0"
1717            );
1718        }
1719    }
1720
1721    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
1722    #[test]
1723    fn test_row_group_metadata_thrift_corrupted() {
1724        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
1725            SchemaType::group_type_builder("schema")
1726                .with_fields(vec![
1727                    Arc::new(
1728                        SchemaType::primitive_type_builder("a", Type::INT32)
1729                            .build()
1730                            .unwrap(),
1731                    ),
1732                    Arc::new(
1733                        SchemaType::primitive_type_builder("b", Type::INT32)
1734                            .build()
1735                            .unwrap(),
1736                    ),
1737                ])
1738                .build()
1739                .unwrap(),
1740        )));
1741
1742        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
1743            SchemaType::group_type_builder("schema")
1744                .with_fields(vec![
1745                    Arc::new(
1746                        SchemaType::primitive_type_builder("a", Type::INT32)
1747                            .build()
1748                            .unwrap(),
1749                    ),
1750                    Arc::new(
1751                        SchemaType::primitive_type_builder("b", Type::INT32)
1752                            .build()
1753                            .unwrap(),
1754                    ),
1755                    Arc::new(
1756                        SchemaType::primitive_type_builder("c", Type::INT32)
1757                            .build()
1758                            .unwrap(),
1759                    ),
1760                ])
1761                .build()
1762                .unwrap(),
1763        )));
1764
1765        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
1766            .set_num_rows(1000)
1767            .set_total_byte_size(2000)
1768            .set_column_metadata(vec![
1769                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
1770                    .build()
1771                    .unwrap(),
1772                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
1773                    .build()
1774                    .unwrap(),
1775            ])
1776            .set_ordinal(1)
1777            .build()
1778            .unwrap();
1779        let mut buf = Vec::new();
1780        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1781        row_group_meta_2cols.write_thrift(&mut writer).unwrap();
1782
1783        let err = read_row_group(&mut buf, schema_descr_3cols)
1784            .unwrap_err()
1785            .to_string();
1786        assert_eq!(
1787            err,
1788            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
1789        );
1790    }
1791
1792    #[test]
1793    fn test_column_chunk_metadata_thrift_conversion() {
1794        let column_descr = get_test_schema_descr().column(0);
1795        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1796            .set_encodings_mask(EncodingMask::new_from_encodings(
1797                [Encoding::PLAIN, Encoding::RLE].iter(),
1798            ))
1799            .set_file_path("file_path".to_owned())
1800            .set_num_values(1000)
1801            .set_compression(Compression::SNAPPY)
1802            .set_total_compressed_size(2000)
1803            .set_total_uncompressed_size(3000)
1804            .set_data_page_offset(4000)
1805            .set_dictionary_page_offset(Some(5000))
1806            .set_page_encoding_stats(vec![
1807                PageEncodingStats {
1808                    page_type: PageType::DATA_PAGE,
1809                    encoding: Encoding::PLAIN,
1810                    count: 3,
1811                },
1812                PageEncodingStats {
1813                    page_type: PageType::DATA_PAGE,
1814                    encoding: Encoding::RLE,
1815                    count: 5,
1816                },
1817            ])
1818            .set_bloom_filter_offset(Some(6000))
1819            .set_bloom_filter_length(Some(25))
1820            .set_offset_index_offset(Some(7000))
1821            .set_offset_index_length(Some(25))
1822            .set_column_index_offset(Some(8000))
1823            .set_column_index_length(Some(25))
1824            .set_unencoded_byte_array_data_bytes(Some(2000))
1825            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
1826            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
1827            .build()
1828            .unwrap();
1829
1830        let mut buf = Vec::new();
1831        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1832        col_metadata.write_thrift(&mut writer).unwrap();
1833        let col_chunk_res = read_column_chunk(&mut buf, column_descr.clone()).unwrap();
1834
1835        let expected_metadata = ColumnChunkMetaData::builder(column_descr)
1836            .set_encodings_mask(EncodingMask::new_from_encodings(
1837                [Encoding::PLAIN, Encoding::RLE].iter(),
1838            ))
1839            .set_file_path("file_path".to_owned())
1840            .set_num_values(1000)
1841            .set_compression(Compression::SNAPPY)
1842            .set_total_compressed_size(2000)
1843            .set_total_uncompressed_size(3000)
1844            .set_data_page_offset(4000)
1845            .set_dictionary_page_offset(Some(5000))
1846            .set_page_encoding_stats_mask(EncodingMask::new_from_encodings(
1847                [Encoding::PLAIN, Encoding::RLE].iter(),
1848            ))
1849            .set_bloom_filter_offset(Some(6000))
1850            .set_bloom_filter_length(Some(25))
1851            .set_offset_index_offset(Some(7000))
1852            .set_offset_index_length(Some(25))
1853            .set_column_index_offset(Some(8000))
1854            .set_column_index_length(Some(25))
1855            .set_unencoded_byte_array_data_bytes(Some(2000))
1856            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
1857            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
1858            .build()
1859            .unwrap();
1860
1861        assert_eq!(col_chunk_res, expected_metadata);
1862    }
1863
1864    #[test]
1865    fn test_column_chunk_metadata_thrift_conversion_full_stats() {
1866        let column_descr = get_test_schema_descr().column(0);
1867        let stats = vec![
1868            PageEncodingStats {
1869                page_type: PageType::DATA_PAGE,
1870                encoding: Encoding::PLAIN,
1871                count: 3,
1872            },
1873            PageEncodingStats {
1874                page_type: PageType::DATA_PAGE,
1875                encoding: Encoding::RLE,
1876                count: 5,
1877            },
1878        ];
1879        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1880            .set_encodings_mask(EncodingMask::new_from_encodings(
1881                [Encoding::PLAIN, Encoding::RLE].iter(),
1882            ))
1883            .set_num_values(1000)
1884            .set_compression(Compression::SNAPPY)
1885            .set_total_compressed_size(2000)
1886            .set_total_uncompressed_size(3000)
1887            .set_data_page_offset(4000)
1888            .set_page_encoding_stats(stats)
1889            .build()
1890            .unwrap();
1891
1892        let mut buf = Vec::new();
1893        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1894        col_metadata.write_thrift(&mut writer).unwrap();
1895
1896        let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
1897        let col_chunk_res =
1898            read_column_chunk_with_options(&mut buf, column_descr, Some(&options)).unwrap();
1899
1900        assert_eq!(col_chunk_res, col_metadata);
1901    }
1902
1903    #[test]
1904    fn test_column_chunk_metadata_thrift_conversion_empty() {
1905        let column_descr = get_test_schema_descr().column(0);
1906
1907        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1908            .build()
1909            .unwrap();
1910
1911        let mut buf = Vec::new();
1912        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1913        col_metadata.write_thrift(&mut writer).unwrap();
1914        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();
1915
1916        assert_eq!(col_chunk_res, col_metadata);
1917    }
1918
1919    #[test]
1920    fn test_compressed_size() {
1921        let schema_descr = get_test_schema_descr();
1922
1923        let mut columns = vec![];
1924        for column_descr in schema_descr.columns() {
1925            let column = ColumnChunkMetaData::builder(column_descr.clone())
1926                .set_total_compressed_size(500)
1927                .set_total_uncompressed_size(700)
1928                .build()
1929                .unwrap();
1930            columns.push(column);
1931        }
1932        let row_group_meta = RowGroupMetaData::builder(schema_descr)
1933            .set_num_rows(1000)
1934            .set_column_metadata(columns)
1935            .build()
1936            .unwrap();
1937
1938        let compressed_size_res: i64 = row_group_meta.compressed_size();
1939        let compressed_size_exp: i64 = 1000;
1940
1941        assert_eq!(compressed_size_res, compressed_size_exp);
1942    }
1943
1944    #[test]
1945    fn test_memory_size() {
1946        let schema_descr = get_test_schema_descr();
1947
1948        let columns = schema_descr
1949            .columns()
1950            .iter()
1951            .map(|column_descr| {
1952                ColumnChunkMetaData::builder(column_descr.clone())
1953                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
1954                    .build()
1955            })
1956            .collect::<Result<Vec<_>>>()
1957            .unwrap();
1958        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
1959            .set_num_rows(1000)
1960            .set_column_metadata(columns)
1961            .build()
1962            .unwrap();
1963        let row_group_meta = vec![row_group_meta];
1964
1965        let version = 2;
1966        let num_rows = 1000;
1967        let created_by = Some(String::from("test harness"));
1968        let key_value_metadata = Some(vec![KeyValue::new(
1969            String::from("Foo"),
1970            Some(String::from("bar")),
1971        )]);
1972        let column_orders = Some(vec![
1973            ColumnOrder::UNDEFINED,
1974            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
1975        ]);
1976        let file_metadata = FileMetaData::new(
1977            version,
1978            num_rows,
1979            created_by,
1980            key_value_metadata,
1981            schema_descr.clone(),
1982            column_orders,
1983        );
1984
1985        // Now, add in Exact Statistics
1986        let columns_with_stats = schema_descr
1987            .columns()
1988            .iter()
1989            .map(|column_descr| {
1990                ColumnChunkMetaData::builder(column_descr.clone())
1991                    .set_statistics(Statistics::new::<i32>(
1992                        Some(0),
1993                        Some(100),
1994                        None,
1995                        None,
1996                        false,
1997                    ))
1998                    .build()
1999            })
2000            .collect::<Result<Vec<_>>>()
2001            .unwrap();
2002
2003        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
2004            .set_num_rows(1000)
2005            .set_column_metadata(columns_with_stats)
2006            .build()
2007            .unwrap();
2008        let row_group_meta_with_stats = vec![row_group_meta_with_stats];
2009
2010        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
2011            .set_row_groups(row_group_meta_with_stats)
2012            .build();
2013
2014        #[cfg(not(feature = "encryption"))]
2015        let base_expected_size = 2766;
2016        #[cfg(feature = "encryption")]
2017        let base_expected_size = 2934;
2018
2019        assert_eq!(parquet_meta.memory_size(), base_expected_size);
2020
2021        let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN);
2022        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
2023        let column_index = column_index.build().unwrap();
2024        let native_index = match column_index {
2025            ColumnIndexMetaData::BOOLEAN(index) => index,
2026            _ => panic!("wrong type of column index"),
2027        };
2028
2029        // Now, add in OffsetIndex
2030        let mut offset_index = OffsetIndexBuilder::new();
2031        offset_index.append_row_count(1);
2032        offset_index.append_offset_and_size(2, 3);
2033        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
2034        offset_index.append_row_count(1);
2035        offset_index.append_offset_and_size(2, 3);
2036        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
2037        let offset_index = offset_index.build();
2038
2039        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
2040            .set_row_groups(row_group_meta)
2041            .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]]))
2042            .set_offset_index(Some(vec![vec![offset_index]]))
2043            .build();
2044
2045        #[cfg(not(feature = "encryption"))]
2046        let bigger_expected_size = 3192;
2047        #[cfg(feature = "encryption")]
2048        let bigger_expected_size = 3360;
2049
2050        // more set fields means more memory usage
2051        assert!(bigger_expected_size > base_expected_size);
2052        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
2053    }
2054
2055    #[test]
2056    #[cfg(feature = "encryption")]
2057    fn test_memory_size_with_decryptor() {
2058        use crate::encryption::decrypt::FileDecryptionProperties;
2059        use crate::file::metadata::thrift::encryption::AesGcmV1;
2060
2061        let schema_descr = get_test_schema_descr();
2062
2063        let columns = schema_descr
2064            .columns()
2065            .iter()
2066            .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build())
2067            .collect::<Result<Vec<_>>>()
2068            .unwrap();
2069        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
2070            .set_num_rows(1000)
2071            .set_column_metadata(columns)
2072            .build()
2073            .unwrap();
2074        let row_group_meta = vec![row_group_meta];
2075
2076        let version = 2;
2077        let num_rows = 1000;
2078        let aad_file_unique = vec![1u8; 8];
2079        let aad_prefix = vec![2u8; 8];
2080        let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 {
2081            aad_prefix: Some(aad_prefix.clone()),
2082            aad_file_unique: Some(aad_file_unique.clone()),
2083            supply_aad_prefix: Some(true),
2084        });
2085        let footer_key_metadata = Some(vec![3u8; 8]);
2086        let file_metadata =
2087            FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None)
2088                .with_encryption_algorithm(Some(encryption_algorithm))
2089                .with_footer_signing_key_metadata(footer_key_metadata.clone());
2090
2091        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
2092            .set_row_groups(row_group_meta.clone())
2093            .build();
2094
2095        let base_expected_size = 2058;
2096        assert_eq!(parquet_meta_data.memory_size(), base_expected_size);
2097
2098        let footer_key = "0123456789012345".as_bytes();
2099        let column_key = "1234567890123450".as_bytes();
2100        let mut decryption_properties_builder =
2101            FileDecryptionProperties::builder(footer_key.to_vec())
2102                .with_aad_prefix(aad_prefix.clone());
2103        for column in schema_descr.columns() {
2104            decryption_properties_builder = decryption_properties_builder
2105                .with_column_key(&column.path().string(), column_key.to_vec());
2106        }
2107        let decryption_properties = decryption_properties_builder.build().unwrap();
2108        let decryptor = FileDecryptor::new(
2109            &decryption_properties,
2110            footer_key_metadata.as_deref(),
2111            aad_file_unique,
2112            aad_prefix,
2113        )
2114        .unwrap();
2115
2116        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
2117            .set_row_groups(row_group_meta.clone())
2118            .set_file_decryptor(Some(decryptor))
2119            .build();
2120
2121        let expected_size_with_decryptor = 3072;
2122        assert!(expected_size_with_decryptor > base_expected_size);
2123
2124        assert_eq!(
2125            parquet_meta_data.memory_size(),
2126            expected_size_with_decryptor
2127        );
2128    }
2129
2130    /// Returns a sample schema descriptor so we can create column metadata.
2131    fn get_test_schema_descr() -> SchemaDescPtr {
2132        let schema = SchemaType::group_type_builder("schema")
2133            .with_fields(vec![
2134                Arc::new(
2135                    SchemaType::primitive_type_builder("a", Type::INT32)
2136                        .build()
2137                        .unwrap(),
2138                ),
2139                Arc::new(
2140                    SchemaType::primitive_type_builder("b", Type::INT32)
2141                        .build()
2142                        .unwrap(),
2143                ),
2144            ])
2145            .build()
2146            .unwrap();
2147
2148        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
2149    }
2150}