// parquet/file/metadata/mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet metadata API
19//!
20//! Users should use these structures to interact with Parquet metadata.
21//!
22//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
23//!   file footer.
24//!
25//! * [`FileMetaData`]: File level metadata such as schema, row counts and
26//!   version.
27//!
//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
//!   location and number of rows, and column chunks.
30//!
31//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
32//!   within a Row Group including encoding and compression information,
33//!   number of values, statistics, etc.
34//!
35//! # APIs for working with Parquet Metadata
36//!
37//! The Parquet readers and writers in this crate handle reading and writing
38//! metadata into parquet files. To work with metadata directly,
39//! the following APIs are available:
40//!
41//! * [`ParquetMetaDataReader`] for reading metadata from an I/O source (sync and async)
42//! * [`ParquetMetaDataPushDecoder`] for decoding from bytes without I/O
43//! * [`ParquetMetaDataWriter`] for writing.
44//!
45//! # Examples
46//!
47//! Please see [`external_metadata.rs`]
48//!
49//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
50//!
51//! # Metadata Encodings and Structures
52//!
53//! There are three different encodings of Parquet Metadata in this crate:
54//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
//!    [parquet.thrift]
57//!
58//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
59//!    from [parquet.thrift]. These structures are low level and mirror
60//!    the thrift definitions.
61//!
62//! 3. [`file::metadata`] (this module): Easier to use Rust structures
63//!    with a more idiomatic API. Note that, confusingly, some but not all
64//!    of these structures have the same name as the [`format`] structures.
65//!
66//! [`file::metadata`]: crate::file::metadata
67//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
68//!
69//! Graphically, this is how the different structures relate to each other:
70//!
71//! ```text
72//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
73//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
74//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
75//!                            └──────────────┘     │         └───────────────────────┘ │
76//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
77//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
78//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
79//!                                     ...         │                   ...             │
80//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
81//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
82//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
83//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
84//!
85//!                          format::meta structures          file::metadata structures
86//!
87//!                         * Same name, different struct
88//! ```
89mod footer_tail;
90mod memory;
91mod options;
92mod parser;
93mod push_decoder;
94pub(crate) mod reader;
95pub(crate) mod thrift;
96mod writer;
97
98use crate::basic::{EncodingMask, PageType};
99#[cfg(feature = "encryption")]
100use crate::encryption::decrypt::FileDecryptor;
101#[cfg(feature = "encryption")]
102use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
103pub(crate) use crate::file::metadata::memory::HeapSize;
104#[cfg(feature = "encryption")]
105use crate::file::metadata::thrift::encryption::EncryptionAlgorithm;
106use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex};
107use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation};
108use crate::file::statistics::Statistics;
109use crate::geospatial::statistics as geo_statistics;
110use crate::schema::types::{
111    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
112    Type as SchemaType,
113};
114use crate::thrift_struct;
115use crate::{
116    basic::BoundaryOrder,
117    errors::{ParquetError, Result},
118};
119use crate::{
120    basic::{ColumnOrder, Compression, Encoding, Type},
121    parquet_thrift::{
122        ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
123        ThriftCompactOutputProtocol, WriteThrift, WriteThriftField,
124    },
125};
126use crate::{
127    data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
128};
129
130pub use footer_tail::FooterTail;
131pub use options::ParquetMetaDataOptions;
132pub use push_decoder::ParquetMetaDataPushDecoder;
133pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
134use std::io::Write;
135use std::ops::Range;
136use std::sync::Arc;
137pub use writer::ParquetMetaDataWriter;
138pub(crate) use writer::ThriftMetadataWriter;
139
/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`ColumnIndex`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`ColumnIndex`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example `column_index[2][3]` holds the [`ColumnIndex`] for the fourth
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;
157
/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number`.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
170
/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata
    file_metadata: FileMetaData,
    /// Row group metadata, in file order
    row_groups: Vec<RowGroupMetaData>,
    /// Optional "Page Index": page level statistics for each column chunk
    column_index: Option<ParquetColumnIndex>,
    /// Optional "Page Index": offset index for each page in each column chunk
    offset_index: Option<ParquetOffsetIndex>,
    /// Optional file decryptor, set when reading encrypted files
    #[cfg(feature = "encryption")]
    file_decryptor: Option<Box<FileDecryptor>>,
}
202
203impl ParquetMetaData {
204    /// Creates Parquet metadata from file metadata and a list of row
205    /// group metadata
206    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
207        ParquetMetaData {
208            file_metadata,
209            row_groups,
210            column_index: None,
211            offset_index: None,
212            #[cfg(feature = "encryption")]
213            file_decryptor: None,
214        }
215    }
216
217    /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of
218    /// encrypted data.
219    #[cfg(feature = "encryption")]
220    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
221        self.file_decryptor = file_decryptor.map(Box::new);
222    }
223
224    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
225    pub fn into_builder(self) -> ParquetMetaDataBuilder {
226        self.into()
227    }
228
229    /// Returns file metadata as reference.
230    pub fn file_metadata(&self) -> &FileMetaData {
231        &self.file_metadata
232    }
233
234    /// Returns file decryptor as reference.
235    #[cfg(feature = "encryption")]
236    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
237        self.file_decryptor.as_deref()
238    }
239
240    /// Returns number of row groups in this file.
241    pub fn num_row_groups(&self) -> usize {
242        self.row_groups.len()
243    }
244
245    /// Returns row group metadata for `i`th position.
246    /// Position should be less than number of row groups `num_row_groups`.
247    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
248        &self.row_groups[i]
249    }
250
251    /// Returns slice of row groups in this file.
252    pub fn row_groups(&self) -> &[RowGroupMetaData] {
253        &self.row_groups
254    }
255
256    /// Returns the column index for this file if loaded
257    ///
258    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
259    /// [ArrowReaderOptions::with_page_index] was set to false.
260    ///
261    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
262    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
263        self.column_index.as_ref()
264    }
265
266    /// Returns offset indexes in this file, if loaded
267    ///
268    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
269    /// [ArrowReaderOptions::with_page_index] was set to false.
270    ///
271    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
272    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
273        self.offset_index.as_ref()
274    }
275
276    /// Estimate of the bytes allocated to store `ParquetMetadata`
277    ///
278    /// # Notes:
279    ///
280    /// 1. Includes size of self
281    ///
282    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
283    ///    [`RowGroupMetaData`].
284    ///
285    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
286    ///    means `memory_size` will over estimate the memory size if such pointers
287    ///    are shared.
288    ///
289    /// 4. Does not include any allocator overheads
290    pub fn memory_size(&self) -> usize {
291        #[cfg(feature = "encryption")]
292        let encryption_size = self.file_decryptor.heap_size();
293        #[cfg(not(feature = "encryption"))]
294        let encryption_size = 0usize;
295
296        std::mem::size_of::<Self>()
297            + self.file_metadata.heap_size()
298            + self.row_groups.heap_size()
299            + self.column_index.heap_size()
300            + self.offset_index.heap_size()
301            + encryption_size
302    }
303
304    /// Override the column index
305    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
306        self.column_index = index;
307    }
308
309    /// Override the offset index
310    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
311        self.offset_index = index;
312    }
313}
314
/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// Create one with [`ParquetMetaDataBuilder::new`] or via
/// [`ParquetMetaData::into_builder`]; finish with [`Self::build`].
///
/// # Example creating a new [`ParquetMetaData`]
///
///```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
pub struct ParquetMetaDataBuilder(ParquetMetaData);
353
354impl ParquetMetaDataBuilder {
355    /// Create a new builder from a file metadata, with no row groups
356    pub fn new(file_meta_data: FileMetaData) -> Self {
357        Self(ParquetMetaData::new(file_meta_data, vec![]))
358    }
359
360    /// Create a new builder from an existing ParquetMetaData
361    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
362        Self(metadata)
363    }
364
365    /// Adds a row group to the metadata
366    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
367        self.0.row_groups.push(row_group);
368        self
369    }
370
371    /// Sets all the row groups to the specified list
372    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
373        self.0.row_groups = row_groups;
374        self
375    }
376
377    /// Takes ownership of the row groups in this builder, and clears the list
378    /// of row groups.
379    ///
380    /// This can be used for more efficient creation of a new ParquetMetaData
381    /// from an existing one.
382    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
383        std::mem::take(&mut self.0.row_groups)
384    }
385
386    /// Return a reference to the current row groups
387    pub fn row_groups(&self) -> &[RowGroupMetaData] {
388        &self.0.row_groups
389    }
390
391    /// Sets the column index
392    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
393        self.0.column_index = column_index;
394        self
395    }
396
397    /// Returns the current column index from the builder, replacing it with `None`
398    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
399        std::mem::take(&mut self.0.column_index)
400    }
401
402    /// Return a reference to the current column index, if any
403    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
404        self.0.column_index.as_ref()
405    }
406
407    /// Sets the offset index
408    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
409        self.0.offset_index = offset_index;
410        self
411    }
412
413    /// Returns the current offset index from the builder, replacing it with `None`
414    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
415        std::mem::take(&mut self.0.offset_index)
416    }
417
418    /// Return a reference to the current offset index, if any
419    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
420        self.0.offset_index.as_ref()
421    }
422
423    /// Sets the file decryptor needed to decrypt this metadata.
424    #[cfg(feature = "encryption")]
425    pub(crate) fn set_file_decryptor(mut self, file_decryptor: Option<FileDecryptor>) -> Self {
426        self.0.with_file_decryptor(file_decryptor);
427        self
428    }
429
430    /// Creates a new ParquetMetaData from the builder
431    pub fn build(self) -> ParquetMetaData {
432        let Self(metadata) = self;
433        metadata
434    }
435}
436
437impl From<ParquetMetaData> for ParquetMetaDataBuilder {
438    fn from(meta_data: ParquetMetaData) -> Self {
439        Self(meta_data)
440    }
441}
442
thrift_struct!(
/// A key-value pair for [`FileMetaData`].
///
/// Used to store arbitrary application-specific metadata in the file footer.
pub struct KeyValue {
  /// The key of this key/value pair
  1: required string key
  /// The (optional) value of this key/value pair
  2: optional string value
}
);
450
451impl KeyValue {
452    /// Create a new key value pair
453    pub fn new<F2>(key: String, value: F2) -> KeyValue
454    where
455        F2: Into<Option<String>>,
456    {
457        KeyValue {
458            key,
459            value: value.into(),
460        }
461    }
462}
463
thrift_struct!(
/// PageEncodingStats for a column chunk and data page.
pub struct PageEncodingStats {
  /// The page type (data / dictionary / ...)
  1: required PageType page_type;
  /// The encoding used for this kind of page
  2: required Encoding encoding;
  /// The number of pages of this type with this encoding
  3: required i32 count;
}
);
472
/// Reference counted pointer for [`FileMetaData`].
///
/// Cloning this pointer is cheap (it only increments a reference count).
pub type FileMetaDataPtr = Arc<FileMetaData>;
475
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Version of this file
    version: i32,
    /// Number of rows in this file
    num_rows: i64,
    /// Optional "created by" string of the writing application
    created_by: Option<String>,
    /// Optional application-specific key/value metadata
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Descriptor of the file's schema
    schema_descr: SchemaDescPtr,
    /// Sort order used for `min`/`max` values of each column, if written
    column_orders: Option<Vec<ColumnOrder>>,
    /// Encryption algorithm from the footer, if the file is encrypted
    #[cfg(feature = "encryption")]
    encryption_algorithm: Option<Box<EncryptionAlgorithm>>,
    /// Key metadata for footer signing, if present
    #[cfg(feature = "encryption")]
    footer_signing_key_metadata: Option<Vec<u8>>,
}
492
493impl FileMetaData {
494    /// Creates new file metadata.
495    pub fn new(
496        version: i32,
497        num_rows: i64,
498        created_by: Option<String>,
499        key_value_metadata: Option<Vec<KeyValue>>,
500        schema_descr: SchemaDescPtr,
501        column_orders: Option<Vec<ColumnOrder>>,
502    ) -> Self {
503        FileMetaData {
504            version,
505            num_rows,
506            created_by,
507            key_value_metadata,
508            schema_descr,
509            column_orders,
510            #[cfg(feature = "encryption")]
511            encryption_algorithm: None,
512            #[cfg(feature = "encryption")]
513            footer_signing_key_metadata: None,
514        }
515    }
516
517    #[cfg(feature = "encryption")]
518    pub(crate) fn with_encryption_algorithm(
519        mut self,
520        encryption_algorithm: Option<EncryptionAlgorithm>,
521    ) -> Self {
522        self.encryption_algorithm = encryption_algorithm.map(Box::new);
523        self
524    }
525
526    #[cfg(feature = "encryption")]
527    pub(crate) fn with_footer_signing_key_metadata(
528        mut self,
529        footer_signing_key_metadata: Option<Vec<u8>>,
530    ) -> Self {
531        self.footer_signing_key_metadata = footer_signing_key_metadata;
532        self
533    }
534
535    /// Returns version of this file.
536    pub fn version(&self) -> i32 {
537        self.version
538    }
539
540    /// Returns number of rows in the file.
541    pub fn num_rows(&self) -> i64 {
542        self.num_rows
543    }
544
545    /// String message for application that wrote this file.
546    ///
547    /// This should have the following format:
548    /// `<application> version <application version> (build <application build hash>)`.
549    ///
550    /// ```shell
551    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
552    /// ```
553    pub fn created_by(&self) -> Option<&str> {
554        self.created_by.as_deref()
555    }
556
557    /// Returns key_value_metadata of this file.
558    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
559        self.key_value_metadata.as_ref()
560    }
561
562    /// Returns Parquet [`Type`] that describes schema in this file.
563    ///
564    /// [`Type`]: crate::schema::types::Type
565    pub fn schema(&self) -> &SchemaType {
566        self.schema_descr.root_schema()
567    }
568
569    /// Returns a reference to schema descriptor.
570    pub fn schema_descr(&self) -> &SchemaDescriptor {
571        &self.schema_descr
572    }
573
574    /// Returns reference counted clone for schema descriptor.
575    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
576        self.schema_descr.clone()
577    }
578
579    /// Column (sort) order used for `min` and `max` values of each column in this file.
580    ///
581    /// Each column order corresponds to one column, determined by its position in the
582    /// list, matching the position of the column in the schema.
583    ///
584    /// When `None` is returned, there are no column orders available, and each column
585    /// should be assumed to have undefined (legacy) column order.
586    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
587        self.column_orders.as_ref()
588    }
589
590    /// Returns column order for `i`th column in this file.
591    /// If column orders are not available, returns undefined (legacy) column order.
592    pub fn column_order(&self, i: usize) -> ColumnOrder {
593        self.column_orders
594            .as_ref()
595            .map(|data| data[i])
596            .unwrap_or(ColumnOrder::UNDEFINED)
597    }
598}
599
thrift_struct!(
/// Sort order within a RowGroup of a leaf column
pub struct SortingColumn {
  /// The ordinal position of the column (in this row group)
  1: required i32 column_idx

  /// If true, indicates this column is sorted in descending order.
  2: required bool descending

  /// If true, nulls will come before non-null values, otherwise,
  /// nulls go at the end.
  3: required bool nulls_first
}
);
614
/// Reference counted pointer for [`RowGroupMetaData`].
///
/// Cloning this pointer is cheap (it only increments a reference count).
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
617
/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk in this row group
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group
    num_rows: i64,
    /// Sort ordering of the rows, if declared
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total uncompressed byte size of all column data
    total_byte_size: i64,
    /// Descriptor of the schema for this row group
    schema_descr: SchemaDescPtr,
    /// Stored explicitly: we can't infer it from the file offset of the first
    /// column, since there may be empty columns in the row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
}
634
635impl RowGroupMetaData {
636    /// Returns builder for row group metadata.
637    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
638        RowGroupMetaDataBuilder::new(schema_descr)
639    }
640
641    /// Number of columns in this row group.
642    pub fn num_columns(&self) -> usize {
643        self.columns.len()
644    }
645
646    /// Returns column chunk metadata for `i`th column.
647    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
648        &self.columns[i]
649    }
650
651    /// Returns slice of column chunk metadata.
652    pub fn columns(&self) -> &[ColumnChunkMetaData] {
653        &self.columns
654    }
655
656    /// Returns mutable slice of column chunk metadata.
657    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
658        &mut self.columns
659    }
660
661    /// Number of rows in this row group.
662    pub fn num_rows(&self) -> i64 {
663        self.num_rows
664    }
665
666    /// Returns the sort ordering of the rows in this RowGroup if any
667    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
668        self.sorting_columns.as_ref()
669    }
670
671    /// Total byte size of all uncompressed column data in this row group.
672    pub fn total_byte_size(&self) -> i64 {
673        self.total_byte_size
674    }
675
676    /// Total size of all compressed column data in this row group.
677    pub fn compressed_size(&self) -> i64 {
678        self.columns.iter().map(|c| c.total_compressed_size).sum()
679    }
680
681    /// Returns reference to a schema descriptor.
682    pub fn schema_descr(&self) -> &SchemaDescriptor {
683        self.schema_descr.as_ref()
684    }
685
686    /// Returns reference counted clone of schema descriptor.
687    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
688        self.schema_descr.clone()
689    }
690
691    /// Returns ordinal position of this row group in file.
692    ///
693    /// For example if this is the first row group in the file, this will return 0.
694    /// If this is the second row group in the file, this will return 1.
695    #[inline(always)]
696    pub fn ordinal(&self) -> Option<i16> {
697        self.ordinal
698    }
699
700    /// Returns file offset of this row group in file.
701    #[inline(always)]
702    pub fn file_offset(&self) -> Option<i64> {
703        self.file_offset
704    }
705
706    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
707    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
708        RowGroupMetaDataBuilder(self)
709    }
710}
711
/// Builder for row group metadata.
///
/// Create one with [`RowGroupMetaData::builder`]; finish with [`Self::build`].
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
714
715impl RowGroupMetaDataBuilder {
716    /// Creates new builder from schema descriptor.
717    fn new(schema_descr: SchemaDescPtr) -> Self {
718        Self(RowGroupMetaData {
719            columns: Vec::with_capacity(schema_descr.num_columns()),
720            schema_descr,
721            file_offset: None,
722            num_rows: 0,
723            sorting_columns: None,
724            total_byte_size: 0,
725            ordinal: None,
726        })
727    }
728
729    /// Sets number of rows in this row group.
730    pub fn set_num_rows(mut self, value: i64) -> Self {
731        self.0.num_rows = value;
732        self
733    }
734
735    /// Sets the sorting order for columns
736    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
737        self.0.sorting_columns = value;
738        self
739    }
740
741    /// Sets total size in bytes for this row group.
742    pub fn set_total_byte_size(mut self, value: i64) -> Self {
743        self.0.total_byte_size = value;
744        self
745    }
746
747    /// Takes ownership of the the column metadata in this builder, and clears
748    /// the list of columns.
749    ///
750    /// This can be used for more efficient creation of a new RowGroupMetaData
751    /// from an existing one.
752    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
753        std::mem::take(&mut self.0.columns)
754    }
755
756    /// Sets column metadata for this row group.
757    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
758        self.0.columns = value;
759        self
760    }
761
762    /// Adds a column metadata to this row group
763    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
764        self.0.columns.push(value);
765        self
766    }
767
768    /// Sets ordinal for this row group.
769    pub fn set_ordinal(mut self, value: i16) -> Self {
770        self.0.ordinal = Some(value);
771        self
772    }
773
774    /// Sets file offset for this row group.
775    pub fn set_file_offset(mut self, value: i64) -> Self {
776        self.0.file_offset = Some(value);
777        self
778    }
779
780    /// Builds row group metadata.
781    pub fn build(self) -> Result<RowGroupMetaData> {
782        if self.0.schema_descr.num_columns() != self.0.columns.len() {
783            return Err(general_err!(
784                "Column length mismatch: {} != {}",
785                self.0.schema_descr.num_columns(),
786                self.0.columns.len()
787            ));
788        }
789
790        Ok(self.0)
791    }
792
793    /// Build row group metadata without validation.
794    pub(super) fn build_unchecked(self) -> RowGroupMetaData {
795        self.0
796    }
797}
798
/// Metadata for a column chunk.
///
/// A column chunk holds all pages of one leaf column within one row group.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor of the leaf column this chunk stores
    column_descr: ColumnDescPtr,
    /// Mask of the encodings used in this chunk
    encodings: EncodingMask,
    // Location of the chunk's data within the file
    file_path: Option<String>,
    file_offset: i64,
    num_values: i64,
    compression: Compression,
    total_compressed_size: i64,
    total_uncompressed_size: i64,
    // Offsets of the chunk's pages (data / index / dictionary)
    data_page_offset: i64,
    index_page_offset: Option<i64>,
    dictionary_page_offset: Option<i64>,
    // Optional statistics for this chunk
    statistics: Option<Statistics>,
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    encoding_stats: Option<Vec<PageEncodingStats>>,
    // Location (offset/length) of the bloom filter, if present
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    // Location (offset/length) of the Page Index structures, if present
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    // Optional size statistics and level histograms
    unencoded_byte_array_data_bytes: Option<i64>,
    repetition_level_histogram: Option<LevelHistogram>,
    definition_level_histogram: Option<LevelHistogram>,
    // Encryption metadata (only with the `encryption` feature)
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
    #[cfg(feature = "encryption")]
    encrypted_column_metadata: Option<Vec<u8>>,
}
830
/// Histograms for repetition and definition levels.
///
/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of
/// values at level `i`.
///
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the
/// number of rows with level 1, and so on.
///
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    /// `inner[i]` is the number of values at level `i`
    inner: Vec<i64>,
}
843
844impl LevelHistogram {
845    /// Creates a new level histogram data.
846    ///
847    /// Length will be `max_level + 1`.
848    ///
849    /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case)
850    pub fn try_new(max_level: i16) -> Option<Self> {
851        if max_level > 0 {
852            Some(Self {
853                inner: vec![0; max_level as usize + 1],
854            })
855        } else {
856            None
857        }
858    }
859    /// Returns a reference to the the histogram's values.
860    pub fn values(&self) -> &[i64] {
861        &self.inner
862    }
863
864    /// Return the inner vector, consuming self
865    pub fn into_inner(self) -> Vec<i64> {
866        self.inner
867    }
868
869    /// Returns the histogram value at the given index.
870    ///
871    /// The value of `i` is the number of values with level `i`. For example,
872    /// `get(1)` returns the number of values with level 1.
873    ///
874    /// Returns `None` if the index is out of bounds.
875    pub fn get(&self, index: usize) -> Option<i64> {
876        self.inner.get(index).copied()
877    }
878
879    /// Adds the values from the other histogram to this histogram
880    ///
881    /// # Panics
882    /// If the histograms have different lengths
883    pub fn add(&mut self, other: &Self) {
884        assert_eq!(self.len(), other.len());
885        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
886            *dst += src;
887        }
888    }
889
890    /// return the length of the histogram
891    pub fn len(&self) -> usize {
892        self.inner.len()
893    }
894
895    /// returns if the histogram is empty
896    pub fn is_empty(&self) -> bool {
897        self.inner.is_empty()
898    }
899
900    /// Sets the values of all histogram levels to 0.
901    pub fn reset(&mut self) {
902        for value in self.inner.iter_mut() {
903            *value = 0;
904        }
905    }
906
907    /// Updates histogram values using provided repetition levels
908    ///
909    /// # Panics
910    /// if any of the levels is greater than the length of the histogram (
911    /// the argument supplied to [`Self::try_new`])
912    pub fn update_from_levels(&mut self, levels: &[i16]) {
913        for &level in levels {
914            self.inner[level as usize] += 1;
915        }
916    }
917}
918
919impl From<Vec<i64>> for LevelHistogram {
920    fn from(inner: Vec<i64>) -> Self {
921        Self { inner }
922    }
923}
924
925impl From<LevelHistogram> for Vec<i64> {
926    fn from(value: LevelHistogram) -> Self {
927        value.into_inner()
928    }
929}
930
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        // Only the inner vector's heap allocation contributes; the struct
        // itself is accounted for by the containing type.
        self.inner.heap_size()
    }
}
936
/// Represents common operations for a column chunk.
impl ColumnChunkMetaData {
    /// Returns builder for column chunk metadata.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// File where the column chunk is stored.
    ///
    /// If not set, assumed to belong to the same file as the metadata.
    /// This path is relative to the current file.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Byte offset of `ColumnMetaData` in `file_path()`.
    ///
    /// Note that the meaning of this field has been inconsistent between implementations
    /// so its use has since been deprecated in the Parquet specification. Modern implementations
    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
    /// `ColumnChunk` struct.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Type of this column. Must be primitive.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Path (or identifier) of this column.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Descriptor for this column.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Reference counted clone of descriptor for this column.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// All encodings used for this column.
    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
        self.encodings.encodings()
    }

    /// All encodings used for this column, returned as a bitmask.
    pub fn encodings_mask(&self) -> &EncodingMask {
        &self.encodings
    }

    /// Total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Compression for this column.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed data size of this column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed data size of this column chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset for the column data.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset for the index page.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset for the dictionary page, if any.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the offset and length in bytes of the column chunk within the file
    ///
    /// The chunk starts at the dictionary page when present, otherwise at the
    /// first data page; the length is the total compressed size.
    ///
    /// # Panics
    /// If the start offset or compressed length is negative.
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns statistics that are set for this column chunk,
    /// or `None` if no statistics are available.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns geospatial statistics that are set for this column chunk,
    /// or `None` if no geospatial statistics are available.
    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
        self.geo_statistics.as_deref()
    }

    /// Returns the page encoding stats,
    /// or `None` if no page encoding stats are available.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        self.encoding_stats.as_ref()
    }

    /// Returns the offset for the bloom filter.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length of the bloom filter in bytes.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset for the column index.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length of the column index in bytes.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range for the column index if any
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        // `None` if either field is unset or does not fit in a `u64`.
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset for the offset index.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length of the offset index in bytes.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range for the offset index if any
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        // `None` if either field is unset or does not fit in a `u64`.
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of bytes of variable length data after decoding.
    ///
    /// Only set for BYTE_ARRAY columns. This field may not be set by older
    /// writers.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
    /// `vec[0]` indicates how many rows the page contains.
    /// This field may not be set by older writers.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
    /// This field may not be set by older writers.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the encryption metadata for this column chunk.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_deref()
    }

    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }
}
1140
/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one.
///
/// See [`ColumnChunkMetaData::builder`] to create a builder from a column
/// descriptor, and [`ColumnChunkMetaData::into_builder`] to modify existing
/// metadata.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1160
impl ColumnChunkMetaDataBuilder {
    /// Creates new column chunk metadata builder.
    ///
    /// All fields start at zero / `None` with `Compression::UNCOMPRESSED`.
    ///
    /// See also [`ColumnChunkMetaData::builder`]
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Default::default(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            geo_statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
            #[cfg(feature = "encryption")]
            encrypted_column_metadata: None,
        })
    }

    /// Sets list of encodings for this column chunk.
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = EncodingMask::new_from_encodings(encodings.iter());
        self
    }

    /// Sets the encodings mask for this column chunk.
    pub fn set_encodings_mask(mut self, encodings: EncodingMask) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets optional file path for this column chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets number of values.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets compression.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets data page offset in bytes.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets optional dictionary page offset in bytes.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets optional index page offset in bytes.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets statistics for this column chunk.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Sets geospatial statistics for this column chunk.
    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
        self.0.geo_statistics = Some(value);
        self
    }

    /// Clears the statistics for this column chunk.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets page encoding stats for this column chunk.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(value);
        self
    }

    /// Clears the page encoding stats for this column chunk.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets optional bloom filter offset in bytes.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets optional bloom filter length in bytes.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets optional offset index offset in bytes.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets optional offset index length in bytes.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets optional column index offset in bytes.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets optional column index length in bytes.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets optional length of variable length data in bytes.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets optional repetition level histogram
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets optional definition level histogram
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the encryption metadata for an encrypted column
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value.map(Box::new);
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the serialized encrypted column metadata for an encrypted column
    pub fn set_encrypted_column_metadata(mut self, value: Option<Vec<u8>>) -> Self {
        self.0.encrypted_column_metadata = value;
        self
    }

    /// Builds column chunk metadata.
    ///
    /// Note: currently infallible; the `Result` return type is kept for API
    /// compatibility.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}
1360
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
    /// Physical type of the column; selects which index variant `build` returns.
    column_type: Type,
    /// One entry per page: `true` if the page is entirely null.
    null_pages: Vec<bool>,
    /// Min value bytes for each page.
    min_values: Vec<Vec<u8>>,
    /// Max value bytes for each page.
    max_values: Vec<Vec<u8>>,
    /// Null count for each page.
    null_counts: Vec<i64>,
    /// Ordering of min/max values across pages.
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any entry in the page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-nan values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}
1385
impl ColumnIndexBuilder {
    /// Creates a new column index builder.
    ///
    /// Starts empty, `valid`, and with `BoundaryOrder::UNORDERED`.
    pub fn new(column_type: Type) -> Self {
        ColumnIndexBuilder {
            column_type,
            null_pages: Vec::new(),
            min_values: Vec::new(),
            max_values: Vec::new(),
            null_counts: Vec::new(),
            boundary_order: BoundaryOrder::UNORDERED,
            repetition_level_histograms: None,
            definition_level_histograms: None,
            valid: true,
        }
    }

    /// Append statistics for the next page
    ///
    /// Min/max bytes are pushed even for null pages so the per-page vectors
    /// stay the same length.
    pub fn append(
        &mut self,
        null_page: bool,
        min_value: Vec<u8>,
        max_value: Vec<u8>,
        null_count: i64,
    ) {
        self.null_pages.push(null_page);
        self.min_values.push(min_value);
        self.max_values.push(max_value);
        self.null_counts.push(null_count);
    }

    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
    ///
    /// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
    pub fn append_histograms(
        &mut self,
        repetition_level_histogram: &Option<LevelHistogram>,
        definition_level_histogram: &Option<LevelHistogram>,
    ) {
        if !self.valid {
            return;
        }
        // The column-index histograms are the concatenation of all per-page
        // histograms, appended in page order.
        if let Some(rep_lvl_hist) = repetition_level_histogram {
            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
            hist.reserve(rep_lvl_hist.len());
            hist.extend(rep_lvl_hist.values());
        }
        if let Some(def_lvl_hist) = definition_level_histogram {
            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
            hist.reserve(def_lvl_hist.len());
            hist.extend(def_lvl_hist.values());
        }
    }

    /// Set the boundary order of the column index
    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
        self.boundary_order = boundary_order;
    }

    /// Mark this column index as invalid
    pub fn to_invalid(&mut self) {
        self.valid = false;
    }

    /// Is the information in the builder valid?
    pub fn valid(&self) -> bool {
        self.valid
    }

    /// Build and get the column index
    ///
    /// Dispatches on the column's physical type to produce either a
    /// primitive or a byte-array index variant.
    ///
    /// Note: callers should check [`Self::valid`] before calling this method
    pub fn build(self) -> Result<ColumnIndexMetaData> {
        Ok(match self.column_type {
            Type::BOOLEAN => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::BOOLEAN(index)
            }
            Type::INT32 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT32(index)
            }
            Type::INT64 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT64(index)
            }
            Type::INT96 => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::INT96(index)
            }
            Type::FLOAT => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::FLOAT(index)
            }
            Type::DOUBLE => {
                let index = self.build_page_index()?;
                ColumnIndexMetaData::DOUBLE(index)
            }
            Type::BYTE_ARRAY => {
                let index = self.build_byte_array_index()?;
                ColumnIndexMetaData::BYTE_ARRAY(index)
            }
            Type::FIXED_LEN_BYTE_ARRAY => {
                let index = self.build_byte_array_index()?;
                ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index)
            }
        })
    }

    /// Build a fixed-width (primitive) column index from the accumulated
    /// per-page statistics.
    fn build_page_index<T>(self) -> Result<PrimitiveColumnIndex<T>>
    where
        T: ParquetValueType,
    {
        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();

        PrimitiveColumnIndex::try_new(
            self.null_pages,
            self.boundary_order,
            Some(self.null_counts),
            self.repetition_level_histograms,
            self.definition_level_histograms,
            min_values,
            max_values,
        )
    }

    /// Build a variable-width (byte array) column index from the accumulated
    /// per-page statistics.
    fn build_byte_array_index(self) -> Result<ByteArrayColumnIndex> {
        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();

        ByteArrayColumnIndex::try_new(
            self.null_pages,
            self.boundary_order,
            Some(self.null_counts),
            self.repetition_level_histograms,
            self.definition_level_histograms,
            min_values,
            max_values,
        )
    }
}
1528
impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
    fn from(value: ColumnChunkMetaData) -> Self {
        // Wrap existing metadata so individual fields can be modified before
        // rebuilding; used by `ColumnChunkMetaData::into_builder`.
        ColumnChunkMetaDataBuilder(value)
    }
}
1534
/// Builder for offset index, part of the Parquet [PageIndex].
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
    /// Byte offset of each page.
    offset_array: Vec<i64>,
    /// Compressed size of each page.
    compressed_page_size_array: Vec<i32>,
    /// Index of the first row in each page.
    first_row_index_array: Vec<i64>,
    /// Per-page unencoded byte-array sizes; `None` until one is appended.
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running row total; the first row index of the next page appended.
    current_first_row_index: i64,
}
1545
impl Default for OffsetIndexBuilder {
    fn default() -> Self {
        // Equivalent to an empty builder.
        Self::new()
    }
}
1551
1552impl OffsetIndexBuilder {
1553    /// Creates a new offset index builder.
1554    pub fn new() -> Self {
1555        OffsetIndexBuilder {
1556            offset_array: Vec::new(),
1557            compressed_page_size_array: Vec::new(),
1558            first_row_index_array: Vec::new(),
1559            unencoded_byte_array_data_bytes_array: None,
1560            current_first_row_index: 0,
1561        }
1562    }
1563
1564    /// Append the row count of the next page.
1565    pub fn append_row_count(&mut self, row_count: i64) {
1566        let current_page_row_index = self.current_first_row_index;
1567        self.first_row_index_array.push(current_page_row_index);
1568        self.current_first_row_index += row_count;
1569    }
1570
1571    /// Append the offset and size of the next page.
1572    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1573        self.offset_array.push(offset);
1574        self.compressed_page_size_array.push(compressed_page_size);
1575    }
1576
1577    /// Append the unencoded byte array data bytes of the next page.
1578    pub fn append_unencoded_byte_array_data_bytes(
1579        &mut self,
1580        unencoded_byte_array_data_bytes: Option<i64>,
1581    ) {
1582        if let Some(val) = unencoded_byte_array_data_bytes {
1583            self.unencoded_byte_array_data_bytes_array
1584                .get_or_insert(Vec::new())
1585                .push(val);
1586        }
1587    }
1588
1589    /// Build and get the thrift metadata of offset index
1590    pub fn build(self) -> OffsetIndexMetaData {
1591        let locations = self
1592            .offset_array
1593            .iter()
1594            .zip(self.compressed_page_size_array.iter())
1595            .zip(self.first_row_index_array.iter())
1596            .map(|((offset, size), row_index)| PageLocation {
1597                offset: *offset,
1598                compressed_page_size: *size,
1599                first_row_index: *row_index,
1600            })
1601            .collect::<Vec<_>>();
1602        OffsetIndexMetaData {
1603            page_locations: locations,
1604            unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes_array,
1605        }
1606    }
1607}
1608
1609#[cfg(test)]
1610mod tests {
1611    use super::*;
1612    use crate::basic::{PageType, SortOrder};
1613    use crate::file::metadata::thrift::tests::{read_column_chunk, read_row_group};
1614
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        // One (default) column chunk per leaf column in the schema.
        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        // Serialize to thrift compact protocol bytes ...
        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta.write_thrift(&mut writer).unwrap();

        // ... and reading them back must round-trip exactly.
        let row_group_res = read_row_group(&mut buf, schema_descr).unwrap();

        assert_eq!(row_group_res, row_group_meta);
    }
1640
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        // Building with no column chunks must fail the column-count check.
        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }
1655
    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        // Schema used to *write* the row group (2 columns).
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // Schema used to *read* it back (3 columns), simulating corruption.
        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();
        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta_2cols.write_thrift(&mut writer).unwrap();

        // Decoding with the mismatched schema must produce a clear error.
        let err = read_row_group(&mut buf, schema_descr_3cols)
            .unwrap_err()
            .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }
1726
1727    #[test]
1728    fn test_column_chunk_metadata_thrift_conversion() {
1729        let column_descr = get_test_schema_descr().column(0);
1730        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1731            .set_encodings_mask(EncodingMask::new_from_encodings(
1732                [Encoding::PLAIN, Encoding::RLE].iter(),
1733            ))
1734            .set_file_path("file_path".to_owned())
1735            .set_num_values(1000)
1736            .set_compression(Compression::SNAPPY)
1737            .set_total_compressed_size(2000)
1738            .set_total_uncompressed_size(3000)
1739            .set_data_page_offset(4000)
1740            .set_dictionary_page_offset(Some(5000))
1741            .set_page_encoding_stats(vec![
1742                PageEncodingStats {
1743                    page_type: PageType::DATA_PAGE,
1744                    encoding: Encoding::PLAIN,
1745                    count: 3,
1746                },
1747                PageEncodingStats {
1748                    page_type: PageType::DATA_PAGE,
1749                    encoding: Encoding::RLE,
1750                    count: 5,
1751                },
1752            ])
1753            .set_bloom_filter_offset(Some(6000))
1754            .set_bloom_filter_length(Some(25))
1755            .set_offset_index_offset(Some(7000))
1756            .set_offset_index_length(Some(25))
1757            .set_column_index_offset(Some(8000))
1758            .set_column_index_length(Some(25))
1759            .set_unencoded_byte_array_data_bytes(Some(2000))
1760            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
1761            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
1762            .build()
1763            .unwrap();
1764
1765        let mut buf = Vec::new();
1766        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1767        col_metadata.write_thrift(&mut writer).unwrap();
1768        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();
1769
1770        assert_eq!(col_chunk_res, col_metadata);
1771    }
1772
1773    #[test]
1774    fn test_column_chunk_metadata_thrift_conversion_empty() {
1775        let column_descr = get_test_schema_descr().column(0);
1776
1777        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1778            .build()
1779            .unwrap();
1780
1781        let mut buf = Vec::new();
1782        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1783        col_metadata.write_thrift(&mut writer).unwrap();
1784        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();
1785
1786        assert_eq!(col_chunk_res, col_metadata);
1787    }
1788
1789    #[test]
1790    fn test_compressed_size() {
1791        let schema_descr = get_test_schema_descr();
1792
1793        let mut columns = vec![];
1794        for column_descr in schema_descr.columns() {
1795            let column = ColumnChunkMetaData::builder(column_descr.clone())
1796                .set_total_compressed_size(500)
1797                .set_total_uncompressed_size(700)
1798                .build()
1799                .unwrap();
1800            columns.push(column);
1801        }
1802        let row_group_meta = RowGroupMetaData::builder(schema_descr)
1803            .set_num_rows(1000)
1804            .set_column_metadata(columns)
1805            .build()
1806            .unwrap();
1807
1808        let compressed_size_res: i64 = row_group_meta.compressed_size();
1809        let compressed_size_exp: i64 = 1000;
1810
1811        assert_eq!(compressed_size_res, compressed_size_exp);
1812    }
1813
    /// Checks `ParquetMetaData::memory_size` against hard-coded expected
    /// values, and that adding more metadata (exact statistics, column and
    /// offset indexes) strictly increases the reported size.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        // One column chunk per leaf column, each with all-None statistics —
        // this is the baseline configuration measured below.
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        // File-level metadata with optional fields populated (created_by,
        // key/value pairs, column orders) so they contribute to the size.
        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Now, add in Exact Statistics
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        // NOTE: expected sizes are hard-coded and must be updated whenever
        // the in-memory layout of the metadata structs changes; the
        // "encryption" feature adds fields, hence the separate constant.
        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2766;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2934;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Build a BOOLEAN column index with a single page entry.
        let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN);
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build().unwrap();
        let native_index = match column_index {
            ColumnIndexMetaData::BOOLEAN(index) => index,
            _ => panic!("wrong type of column index"),
        };

        // Now, add in OffsetIndex
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build();

        // Attach the column and offset indexes; the measured size must grow.
        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![offset_index]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 3192;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3360;

        // more set fields means more memory usage
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }
1924
    /// Checks that attaching a `FileDecryptor` to `ParquetMetaData`
    /// increases the value reported by `memory_size` and matches the
    /// hard-coded expected total.
    #[test]
    #[cfg(feature = "encryption")]
    fn test_memory_size_with_decryptor() {
        use crate::encryption::decrypt::FileDecryptionProperties;
        use crate::file::metadata::thrift::encryption::AesGcmV1;

        let schema_descr = get_test_schema_descr();

        // Minimal row group: one default column chunk per leaf column.
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build())
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        // File metadata carrying an AES-GCM-V1 encryption algorithm and
        // footer signing key metadata, but no decryptor yet — this is the
        // baseline configuration measured first.
        let version = 2;
        let num_rows = 1000;
        let aad_file_unique = vec![1u8; 8];
        let aad_prefix = vec![2u8; 8];
        let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 {
            aad_prefix: Some(aad_prefix.clone()),
            aad_file_unique: Some(aad_file_unique.clone()),
            supply_aad_prefix: Some(true),
        });
        let footer_key_metadata = Some(vec![3u8; 8]);
        let file_metadata =
            FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None)
                .with_encryption_algorithm(Some(encryption_algorithm))
                .with_footer_signing_key_metadata(footer_key_metadata.clone());

        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .build();

        // NOTE: hard-coded expected size; must be updated whenever the
        // in-memory layout of the metadata structs changes.
        let base_expected_size = 2058;
        assert_eq!(parquet_meta_data.memory_size(), base_expected_size);

        // Build decryption properties with a footer key plus one key per
        // column, then construct the file decryptor from them.
        let footer_key = "0123456789012345".as_bytes();
        let column_key = "1234567890123450".as_bytes();
        let mut decryption_properties_builder =
            FileDecryptionProperties::builder(footer_key.to_vec())
                .with_aad_prefix(aad_prefix.clone());
        for column in schema_descr.columns() {
            decryption_properties_builder = decryption_properties_builder
                .with_column_key(&column.path().string(), column_key.to_vec());
        }
        let decryption_properties = decryption_properties_builder.build().unwrap();
        let decryptor = FileDecryptor::new(
            &decryption_properties,
            footer_key_metadata.as_deref(),
            aad_file_unique,
            aad_prefix,
        )
        .unwrap();

        // Same metadata, now with the decryptor attached.
        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .set_file_decryptor(Some(decryptor))
            .build();

        let expected_size_with_decryptor = 3072;
        assert!(expected_size_with_decryptor > base_expected_size);

        assert_eq!(
            parquet_meta_data.memory_size(),
            expected_size_with_decryptor
        );
    }
1999
2000    /// Returns sample schema descriptor so we can create column metadata.
2001    fn get_test_schema_descr() -> SchemaDescPtr {
2002        let schema = SchemaType::group_type_builder("schema")
2003            .with_fields(vec![
2004                Arc::new(
2005                    SchemaType::primitive_type_builder("a", Type::INT32)
2006                        .build()
2007                        .unwrap(),
2008                ),
2009                Arc::new(
2010                    SchemaType::primitive_type_builder("b", Type::INT32)
2011                        .build()
2012                        .unwrap(),
2013                ),
2014            ])
2015            .build()
2016            .unwrap();
2017
2018        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
2019    }
2020}