parquet/file/metadata/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet metadata API
19//!
20//! Users should use these structures to interact with Parquet metadata.
21//!
22//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
23//!   file footer.
24//!
25//! * [`FileMetaData`]: File level metadata such as schema, row counts and
26//!   version.
27//!
//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
29//!   location and number of rows, and column chunks.
30//!
31//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
32//!   within a Row Group including encoding and compression information,
33//!   number of values, statistics, etc.
34//!
35//! # APIs for working with Parquet Metadata
36//!
37//! The Parquet readers and writers in this crate handle reading and writing
38//! metadata into parquet files. To work with metadata directly,
39//! the following APIs are available:
40//!
41//! * [`ParquetMetaDataReader`] for reading metadata from an I/O source (sync and async)
42//! * [`ParquetMetaDataPushDecoder`] for decoding from bytes without I/O
43//! * [`ParquetMetaDataWriter`] for writing.
44//!
45//! # Examples
46//!
47//! Please see [`external_metadata.rs`]
48//!
49//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
50//!
51//! # Metadata Encodings and Structures
52//!
53//! There are three different encodings of Parquet Metadata in this crate:
54//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
56//!    [parquet.thrift]
57//!
58//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
59//!    from [parquet.thrift]. These structures are low level and mirror
60//!    the thrift definitions.
61//!
62//! 3. [`file::metadata`] (this module): Easier to use Rust structures
63//!    with a more idiomatic API. Note that, confusingly, some but not all
64//!    of these structures have the same name as the [`format`] structures.
65//!
66//! [`file::metadata`]: crate::file::metadata
67//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
68//!
69//! Graphically, this is how the different structures relate to each other:
70//!
71//! ```text
72//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
73//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
74//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
75//!                            └──────────────┘     │         └───────────────────────┘ │
76//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
77//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
78//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
79//!                                     ...         │                   ...             │
80//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
81//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
82//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
83//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
84//!
85//!                          format::meta structures          file::metadata structures
86//!
87//!                         * Same name, different struct
88//! ```
89mod footer_tail;
90mod memory;
91mod parser;
92mod push_decoder;
93pub(crate) mod reader;
94pub(crate) mod thrift;
95mod writer;
96
97use crate::basic::{EncodingMask, PageType};
98#[cfg(feature = "encryption")]
99use crate::encryption::decrypt::FileDecryptor;
100#[cfg(feature = "encryption")]
101use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
102pub(crate) use crate::file::metadata::memory::HeapSize;
103#[cfg(feature = "encryption")]
104use crate::file::metadata::thrift::encryption::EncryptionAlgorithm;
105use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex};
106use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation};
107use crate::file::statistics::Statistics;
108use crate::geospatial::statistics as geo_statistics;
109use crate::schema::types::{
110    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
111    Type as SchemaType,
112};
113use crate::thrift_struct;
114use crate::{
115    basic::BoundaryOrder,
116    errors::{ParquetError, Result},
117};
118use crate::{
119    basic::{ColumnOrder, Compression, Encoding, Type},
120    parquet_thrift::{
121        ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
122        ThriftCompactOutputProtocol, WriteThrift, WriteThriftField,
123    },
124};
125use crate::{
126    data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
127};
128
129pub use footer_tail::FooterTail;
130pub use push_decoder::ParquetMetaDataPushDecoder;
131pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
132use std::io::Write;
133use std::ops::Range;
134use std::sync::Arc;
135pub use writer::ParquetMetaDataWriter;
136pub(crate) use writer::ThriftMetadataWriter;
137
/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`ColumnIndex`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`ColumnIndex`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example, `column_index[2][3]` holds the [`ColumnIndex`] for the fourth
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;
155
/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number`.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
168
/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata
    file_metadata: FileMetaData,
    /// Metadata for each row group, in file order
    row_groups: Vec<RowGroupMetaData>,
    /// Page-level column statistics ("Page Index"), if loaded
    /// (see [`Self::column_index`])
    column_index: Option<ParquetColumnIndex>,
    /// Page locations for each column chunk, if loaded
    /// (see [`Self::offset_index`])
    offset_index: Option<ParquetOffsetIndex>,
    /// Optional decryptor for reading encrypted files
    /// (see [`Self::with_file_decryptor`])
    #[cfg(feature = "encryption")]
    file_decryptor: Option<Box<FileDecryptor>>,
}
200
201impl ParquetMetaData {
202    /// Creates Parquet metadata from file metadata and a list of row
203    /// group metadata
204    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
205        ParquetMetaData {
206            file_metadata,
207            row_groups,
208            column_index: None,
209            offset_index: None,
210            #[cfg(feature = "encryption")]
211            file_decryptor: None,
212        }
213    }
214
215    /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of
216    /// encrypted data.
217    #[cfg(feature = "encryption")]
218    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
219        self.file_decryptor = file_decryptor.map(Box::new);
220    }
221
222    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
223    pub fn into_builder(self) -> ParquetMetaDataBuilder {
224        self.into()
225    }
226
227    /// Returns file metadata as reference.
228    pub fn file_metadata(&self) -> &FileMetaData {
229        &self.file_metadata
230    }
231
232    /// Returns file decryptor as reference.
233    #[cfg(feature = "encryption")]
234    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
235        self.file_decryptor.as_deref()
236    }
237
238    /// Returns number of row groups in this file.
239    pub fn num_row_groups(&self) -> usize {
240        self.row_groups.len()
241    }
242
243    /// Returns row group metadata for `i`th position.
244    /// Position should be less than number of row groups `num_row_groups`.
245    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
246        &self.row_groups[i]
247    }
248
249    /// Returns slice of row groups in this file.
250    pub fn row_groups(&self) -> &[RowGroupMetaData] {
251        &self.row_groups
252    }
253
254    /// Returns the column index for this file if loaded
255    ///
256    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
257    /// [ArrowReaderOptions::with_page_index] was set to false.
258    ///
259    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
260    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
261        self.column_index.as_ref()
262    }
263
264    /// Returns offset indexes in this file, if loaded
265    ///
266    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
267    /// [ArrowReaderOptions::with_page_index] was set to false.
268    ///
269    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
270    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
271        self.offset_index.as_ref()
272    }
273
274    /// Estimate of the bytes allocated to store `ParquetMetadata`
275    ///
276    /// # Notes:
277    ///
278    /// 1. Includes size of self
279    ///
280    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
281    ///    [`RowGroupMetaData`].
282    ///
283    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
284    ///    means `memory_size` will over estimate the memory size if such pointers
285    ///    are shared.
286    ///
287    /// 4. Does not include any allocator overheads
288    pub fn memory_size(&self) -> usize {
289        #[cfg(feature = "encryption")]
290        let encryption_size = self.file_decryptor.heap_size();
291        #[cfg(not(feature = "encryption"))]
292        let encryption_size = 0usize;
293
294        std::mem::size_of::<Self>()
295            + self.file_metadata.heap_size()
296            + self.row_groups.heap_size()
297            + self.column_index.heap_size()
298            + self.offset_index.heap_size()
299            + encryption_size
300    }
301
302    /// Override the column index
303    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
304        self.column_index = index;
305    }
306
307    /// Override the offset index
308    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
309        self.offset_index = index;
310    }
311}
312
/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// Obtain one via [`Self::new`], [`Self::new_from_metadata`], or
/// [`ParquetMetaData::into_builder`].
///
/// # Example creating a new [`ParquetMetaData`]
///
///```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
pub struct ParquetMetaDataBuilder(ParquetMetaData);
351
352impl ParquetMetaDataBuilder {
353    /// Create a new builder from a file metadata, with no row groups
354    pub fn new(file_meta_data: FileMetaData) -> Self {
355        Self(ParquetMetaData::new(file_meta_data, vec![]))
356    }
357
358    /// Create a new builder from an existing ParquetMetaData
359    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
360        Self(metadata)
361    }
362
363    /// Adds a row group to the metadata
364    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
365        self.0.row_groups.push(row_group);
366        self
367    }
368
369    /// Sets all the row groups to the specified list
370    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
371        self.0.row_groups = row_groups;
372        self
373    }
374
375    /// Takes ownership of the row groups in this builder, and clears the list
376    /// of row groups.
377    ///
378    /// This can be used for more efficient creation of a new ParquetMetaData
379    /// from an existing one.
380    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
381        std::mem::take(&mut self.0.row_groups)
382    }
383
384    /// Return a reference to the current row groups
385    pub fn row_groups(&self) -> &[RowGroupMetaData] {
386        &self.0.row_groups
387    }
388
389    /// Sets the column index
390    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
391        self.0.column_index = column_index;
392        self
393    }
394
395    /// Returns the current column index from the builder, replacing it with `None`
396    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
397        std::mem::take(&mut self.0.column_index)
398    }
399
400    /// Return a reference to the current column index, if any
401    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
402        self.0.column_index.as_ref()
403    }
404
405    /// Sets the offset index
406    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
407        self.0.offset_index = offset_index;
408        self
409    }
410
411    /// Returns the current offset index from the builder, replacing it with `None`
412    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
413        std::mem::take(&mut self.0.offset_index)
414    }
415
416    /// Return a reference to the current offset index, if any
417    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
418        self.0.offset_index.as_ref()
419    }
420
421    /// Sets the file decryptor needed to decrypt this metadata.
422    #[cfg(feature = "encryption")]
423    pub(crate) fn set_file_decryptor(mut self, file_decryptor: Option<FileDecryptor>) -> Self {
424        self.0.with_file_decryptor(file_decryptor);
425        self
426    }
427
428    /// Creates a new ParquetMetaData from the builder
429    pub fn build(self) -> ParquetMetaData {
430        let Self(metadata) = self;
431        metadata
432    }
433}
434
435impl From<ParquetMetaData> for ParquetMetaDataBuilder {
436    fn from(meta_data: ParquetMetaData) -> Self {
437        Self(meta_data)
438    }
439}
440
thrift_struct!(
/// A key-value pair for [`FileMetaData`].
///
/// Stored in the optional `key_value_metadata` list of [`FileMetaData`]
/// (see [`FileMetaData::key_value_metadata`]).
pub struct KeyValue {
  1: required string key
  2: optional string value
}
);
448
449impl KeyValue {
450    /// Create a new key value pair
451    pub fn new<F2>(key: String, value: F2) -> KeyValue
452    where
453        F2: Into<Option<String>>,
454    {
455        KeyValue {
456            key,
457            value: value.into(),
458        }
459    }
460}
461
thrift_struct!(
/// PageEncodingStats for a column chunk and data page.
///
/// Records how many pages of a given type used a given encoding.
pub struct PageEncodingStats {
  /// The page type (data/dic/...)
  1: required PageType page_type;
  /// The encoding used on this page
  2: required Encoding encoding;
  /// The number of pages of this type with this encoding
  3: required i32 count;
}
);
470
/// Reference counted pointer ([`Arc`]) for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
473
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Version of this file (see [`Self::version`])
    version: i32,
    /// Total number of rows across all row groups
    num_rows: i64,
    /// Application that wrote the file (see [`Self::created_by`])
    created_by: Option<String>,
    /// Optional application-defined key/value pairs
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Descriptor of the file's schema
    schema_descr: SchemaDescPtr,
    /// Sort order used for min/max values of each column, if present
    column_orders: Option<Vec<ColumnOrder>>,
    /// Encryption algorithm recorded in the footer, if the file is encrypted
    #[cfg(feature = "encryption")]
    encryption_algorithm: Option<Box<EncryptionAlgorithm>>,
    /// Metadata for the footer signing key, if present
    #[cfg(feature = "encryption")]
    footer_signing_key_metadata: Option<Vec<u8>>,
}
490
491impl FileMetaData {
492    /// Creates new file metadata.
493    pub fn new(
494        version: i32,
495        num_rows: i64,
496        created_by: Option<String>,
497        key_value_metadata: Option<Vec<KeyValue>>,
498        schema_descr: SchemaDescPtr,
499        column_orders: Option<Vec<ColumnOrder>>,
500    ) -> Self {
501        FileMetaData {
502            version,
503            num_rows,
504            created_by,
505            key_value_metadata,
506            schema_descr,
507            column_orders,
508            #[cfg(feature = "encryption")]
509            encryption_algorithm: None,
510            #[cfg(feature = "encryption")]
511            footer_signing_key_metadata: None,
512        }
513    }
514
515    #[cfg(feature = "encryption")]
516    pub(crate) fn with_encryption_algorithm(
517        mut self,
518        encryption_algorithm: Option<EncryptionAlgorithm>,
519    ) -> Self {
520        self.encryption_algorithm = encryption_algorithm.map(Box::new);
521        self
522    }
523
524    #[cfg(feature = "encryption")]
525    pub(crate) fn with_footer_signing_key_metadata(
526        mut self,
527        footer_signing_key_metadata: Option<Vec<u8>>,
528    ) -> Self {
529        self.footer_signing_key_metadata = footer_signing_key_metadata;
530        self
531    }
532
533    /// Returns version of this file.
534    pub fn version(&self) -> i32 {
535        self.version
536    }
537
538    /// Returns number of rows in the file.
539    pub fn num_rows(&self) -> i64 {
540        self.num_rows
541    }
542
543    /// String message for application that wrote this file.
544    ///
545    /// This should have the following format:
546    /// `<application> version <application version> (build <application build hash>)`.
547    ///
548    /// ```shell
549    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
550    /// ```
551    pub fn created_by(&self) -> Option<&str> {
552        self.created_by.as_deref()
553    }
554
555    /// Returns key_value_metadata of this file.
556    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
557        self.key_value_metadata.as_ref()
558    }
559
560    /// Returns Parquet [`Type`] that describes schema in this file.
561    ///
562    /// [`Type`]: crate::schema::types::Type
563    pub fn schema(&self) -> &SchemaType {
564        self.schema_descr.root_schema()
565    }
566
567    /// Returns a reference to schema descriptor.
568    pub fn schema_descr(&self) -> &SchemaDescriptor {
569        &self.schema_descr
570    }
571
572    /// Returns reference counted clone for schema descriptor.
573    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
574        self.schema_descr.clone()
575    }
576
577    /// Column (sort) order used for `min` and `max` values of each column in this file.
578    ///
579    /// Each column order corresponds to one column, determined by its position in the
580    /// list, matching the position of the column in the schema.
581    ///
582    /// When `None` is returned, there are no column orders available, and each column
583    /// should be assumed to have undefined (legacy) column order.
584    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
585        self.column_orders.as_ref()
586    }
587
588    /// Returns column order for `i`th column in this file.
589    /// If column orders are not available, returns undefined (legacy) column order.
590    pub fn column_order(&self, i: usize) -> ColumnOrder {
591        self.column_orders
592            .as_ref()
593            .map(|data| data[i])
594            .unwrap_or(ColumnOrder::UNDEFINED)
595    }
596}
597
thrift_struct!(
/// Sort order within a RowGroup of a leaf column
pub struct SortingColumn {
  /// The ordinal position of the column (in this row group)
  1: required i32 column_idx

  /// If true, indicates this column is sorted in descending order.
  2: required bool descending

  /// If true, nulls will come before non-null values, otherwise,
  /// nulls go at the end.
  3: required bool nulls_first
}
);
612
/// Reference counted pointer ([`Arc`]) for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
615
/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk in this row group
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group
    num_rows: i64,
    /// Sort ordering of the rows, if any
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total byte size of all uncompressed column data
    total_byte_size: i64,
    /// Schema of this row group
    schema_descr: SchemaDescPtr,
    /// Byte offset of this row group in the file.
    /// We can't infer it from the file offset of the first column since there
    /// may be empty columns in the row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
}
632
633impl RowGroupMetaData {
634    /// Returns builder for row group metadata.
635    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
636        RowGroupMetaDataBuilder::new(schema_descr)
637    }
638
639    /// Number of columns in this row group.
640    pub fn num_columns(&self) -> usize {
641        self.columns.len()
642    }
643
644    /// Returns column chunk metadata for `i`th column.
645    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
646        &self.columns[i]
647    }
648
649    /// Returns slice of column chunk metadata.
650    pub fn columns(&self) -> &[ColumnChunkMetaData] {
651        &self.columns
652    }
653
654    /// Returns mutable slice of column chunk metadata.
655    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
656        &mut self.columns
657    }
658
659    /// Number of rows in this row group.
660    pub fn num_rows(&self) -> i64 {
661        self.num_rows
662    }
663
664    /// Returns the sort ordering of the rows in this RowGroup if any
665    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
666        self.sorting_columns.as_ref()
667    }
668
669    /// Total byte size of all uncompressed column data in this row group.
670    pub fn total_byte_size(&self) -> i64 {
671        self.total_byte_size
672    }
673
674    /// Total size of all compressed column data in this row group.
675    pub fn compressed_size(&self) -> i64 {
676        self.columns.iter().map(|c| c.total_compressed_size).sum()
677    }
678
679    /// Returns reference to a schema descriptor.
680    pub fn schema_descr(&self) -> &SchemaDescriptor {
681        self.schema_descr.as_ref()
682    }
683
684    /// Returns reference counted clone of schema descriptor.
685    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
686        self.schema_descr.clone()
687    }
688
689    /// Returns ordinal position of this row group in file.
690    ///
691    /// For example if this is the first row group in the file, this will return 0.
692    /// If this is the second row group in the file, this will return 1.
693    #[inline(always)]
694    pub fn ordinal(&self) -> Option<i16> {
695        self.ordinal
696    }
697
698    /// Returns file offset of this row group in file.
699    #[inline(always)]
700    pub fn file_offset(&self) -> Option<i64> {
701        self.file_offset
702    }
703
704    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
705    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
706        RowGroupMetaDataBuilder(self)
707    }
708}
709
/// Builder for row group metadata.
///
/// Created via [`RowGroupMetaData::builder`]; finished with [`Self::build`].
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
712
713impl RowGroupMetaDataBuilder {
714    /// Creates new builder from schema descriptor.
715    fn new(schema_descr: SchemaDescPtr) -> Self {
716        Self(RowGroupMetaData {
717            columns: Vec::with_capacity(schema_descr.num_columns()),
718            schema_descr,
719            file_offset: None,
720            num_rows: 0,
721            sorting_columns: None,
722            total_byte_size: 0,
723            ordinal: None,
724        })
725    }
726
727    /// Sets number of rows in this row group.
728    pub fn set_num_rows(mut self, value: i64) -> Self {
729        self.0.num_rows = value;
730        self
731    }
732
733    /// Sets the sorting order for columns
734    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
735        self.0.sorting_columns = value;
736        self
737    }
738
739    /// Sets total size in bytes for this row group.
740    pub fn set_total_byte_size(mut self, value: i64) -> Self {
741        self.0.total_byte_size = value;
742        self
743    }
744
745    /// Takes ownership of the the column metadata in this builder, and clears
746    /// the list of columns.
747    ///
748    /// This can be used for more efficient creation of a new RowGroupMetaData
749    /// from an existing one.
750    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
751        std::mem::take(&mut self.0.columns)
752    }
753
754    /// Sets column metadata for this row group.
755    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
756        self.0.columns = value;
757        self
758    }
759
760    /// Adds a column metadata to this row group
761    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
762        self.0.columns.push(value);
763        self
764    }
765
766    /// Sets ordinal for this row group.
767    pub fn set_ordinal(mut self, value: i16) -> Self {
768        self.0.ordinal = Some(value);
769        self
770    }
771
772    /// Sets file offset for this row group.
773    pub fn set_file_offset(mut self, value: i64) -> Self {
774        self.0.file_offset = Some(value);
775        self
776    }
777
778    /// Builds row group metadata.
779    pub fn build(self) -> Result<RowGroupMetaData> {
780        if self.0.schema_descr.num_columns() != self.0.columns.len() {
781            return Err(general_err!(
782                "Column length mismatch: {} != {}",
783                self.0.schema_descr.num_columns(),
784                self.0.columns.len()
785            ));
786        }
787
788        Ok(self.0)
789    }
790
791    /// Build row group metadata without validation.
792    pub(super) fn build_unchecked(self) -> RowGroupMetaData {
793        self.0
794    }
795}
796
/// Metadata for a column chunk.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor (type, path, levels) of this column
    column_descr: ColumnDescPtr,
    /// Set of all encodings used in this column chunk
    encodings: EncodingMask,
    /// File where the column chunk is stored, if not in this file
    file_path: Option<String>,
    /// Byte offset of this column chunk's metadata within the file
    /// (mirrors `ColumnChunk.file_offset` in parquet.thrift)
    file_offset: i64,
    /// Number of values in this column chunk
    num_values: i64,
    /// Compression codec used for this chunk
    compression: Compression,
    /// Total size of this chunk's data, compressed, in bytes
    total_compressed_size: i64,
    /// Total size of this chunk's data, uncompressed, in bytes
    total_uncompressed_size: i64,
    /// Byte offset of the first data page
    data_page_offset: i64,
    /// Byte offset of the index page, if present
    index_page_offset: Option<i64>,
    /// Byte offset of the dictionary page, if present
    dictionary_page_offset: Option<i64>,
    /// Statistics for this column chunk, if present
    statistics: Option<Statistics>,
    /// Geospatial statistics for this column chunk, if present
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    /// Per page-type / encoding page counts, if present
    encoding_stats: Option<Vec<PageEncodingStats>>,
    /// Byte offset of the bloom filter, if present
    bloom_filter_offset: Option<i64>,
    /// Length of the bloom filter in bytes, if present
    bloom_filter_length: Option<i32>,
    /// Byte offset of the offset index, if present
    offset_index_offset: Option<i64>,
    /// Length of the offset index in bytes, if present
    offset_index_length: Option<i32>,
    /// Byte offset of the column index, if present
    column_index_offset: Option<i64>,
    /// Length of the column index in bytes, if present
    column_index_length: Option<i32>,
    /// Size of unencoded byte array data, if recorded
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Histogram of repetition levels, if recorded
    repetition_level_histogram: Option<LevelHistogram>,
    /// Histogram of definition levels, if recorded
    definition_level_histogram: Option<LevelHistogram>,
    /// Metadata describing how this column is encrypted
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
    /// Raw (still encrypted) column metadata bytes, if present
    #[cfg(feature = "encryption")]
    encrypted_column_metadata: Option<Vec<u8>>,
}
828
/// Histograms for repetition and definition levels.
///
/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of
/// values at level `i`.
///
/// For example, `vec[0]` is the number of values with level 0, `vec[1]` is the
/// number of values with level 1, and so on.
///
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    /// Per-level counts; `inner[i]` is the count of values at level `i`.
    inner: Vec<i64>,
}
841
842impl LevelHistogram {
843    /// Creates a new level histogram data.
844    ///
845    /// Length will be `max_level + 1`.
846    ///
847    /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case)
848    pub fn try_new(max_level: i16) -> Option<Self> {
849        if max_level > 0 {
850            Some(Self {
851                inner: vec![0; max_level as usize + 1],
852            })
853        } else {
854            None
855        }
856    }
857    /// Returns a reference to the the histogram's values.
858    pub fn values(&self) -> &[i64] {
859        &self.inner
860    }
861
862    /// Return the inner vector, consuming self
863    pub fn into_inner(self) -> Vec<i64> {
864        self.inner
865    }
866
867    /// Returns the histogram value at the given index.
868    ///
869    /// The value of `i` is the number of values with level `i`. For example,
870    /// `get(1)` returns the number of values with level 1.
871    ///
872    /// Returns `None` if the index is out of bounds.
873    pub fn get(&self, index: usize) -> Option<i64> {
874        self.inner.get(index).copied()
875    }
876
877    /// Adds the values from the other histogram to this histogram
878    ///
879    /// # Panics
880    /// If the histograms have different lengths
881    pub fn add(&mut self, other: &Self) {
882        assert_eq!(self.len(), other.len());
883        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
884            *dst += src;
885        }
886    }
887
888    /// return the length of the histogram
889    pub fn len(&self) -> usize {
890        self.inner.len()
891    }
892
893    /// returns if the histogram is empty
894    pub fn is_empty(&self) -> bool {
895        self.inner.is_empty()
896    }
897
898    /// Sets the values of all histogram levels to 0.
899    pub fn reset(&mut self) {
900        for value in self.inner.iter_mut() {
901            *value = 0;
902        }
903    }
904
905    /// Updates histogram values using provided repetition levels
906    ///
907    /// # Panics
908    /// if any of the levels is greater than the length of the histogram (
909    /// the argument supplied to [`Self::try_new`])
910    pub fn update_from_levels(&mut self, levels: &[i16]) {
911        for &level in levels {
912            self.inner[level as usize] += 1;
913        }
914    }
915}
916
/// Wraps an existing vector of per-level counts as a [`LevelHistogram`].
impl From<Vec<i64>> for LevelHistogram {
    fn from(inner: Vec<i64>) -> Self {
        Self { inner }
    }
}
922
/// Unwraps a [`LevelHistogram`] into its underlying vector of counts.
impl From<LevelHistogram> for Vec<i64> {
    fn from(value: LevelHistogram) -> Self {
        value.into_inner()
    }
}
928
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        // Delegates to the inner `Vec<i64>`'s heap size.
        self.inner.heap_size()
    }
}
934
/// Represents common operations for a column chunk.
impl ColumnChunkMetaData {
    /// Returns builder for column chunk metadata.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// File where the column chunk is stored.
    ///
    /// If not set, assumed to belong to the same file as the metadata.
    /// This path is relative to the current file.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Byte offset of `ColumnMetaData` in `file_path()`.
    ///
    /// Note that the meaning of this field has been inconsistent between implementations
    /// so its use has since been deprecated in the Parquet specification. Modern implementations
    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
    /// `ColumnChunk` struct.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Type of this column. Must be primitive.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Path (or identifier) of this column.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Descriptor for this column.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Reference counted clone of descriptor for this column.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// All encodings used for this column.
    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
        self.encodings.encodings()
    }

    /// All encodings used for this column, returned as a bitmask.
    pub fn encodings_mask(&self) -> &EncodingMask {
        &self.encodings
    }

    /// Total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Compression for this column.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed data size of this column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed data size of this column chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset for the column data.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset for the index page.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset for the dictionary page, if any.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the offset and length in bytes of the column chunk within the file
    ///
    /// The chunk starts at the dictionary page when present, otherwise at the
    /// first data page.
    ///
    /// # Panics
    /// If the stored start offset or compressed length is negative.
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns statistics that are set for this column chunk,
    /// or `None` if no statistics are available.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns geospatial statistics that are set for this column chunk,
    /// or `None` if no geospatial statistics are available.
    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
        self.geo_statistics.as_deref()
    }

    /// Returns the page encoding statistics for this column chunk,
    /// or `None` if no page encoding stats are available.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        self.encoding_stats.as_ref()
    }

    /// Returns the offset for the bloom filter.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length of the bloom filter in bytes.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset for the column index.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length of the column index in bytes.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range for the column index if any
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset for the offset index.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length of the offset index in bytes.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range for the offset index if any
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of bytes of variable length data after decoding.
    ///
    /// Only set for BYTE_ARRAY columns. This field may not be set by older
    /// writers.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
    /// `vec[0]` indicates how many rows the page contains.
    /// This field may not be set by older writers.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
    /// This field may not be set by older writers.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the encryption metadata for this column chunk.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_deref()
    }

    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }
}
1138
/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one.
///
/// See also [`ColumnChunkMetaData::builder`] to create a builder from scratch,
/// and [`ColumnChunkMetaData::into_builder`] to modify existing metadata.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1158
impl ColumnChunkMetaDataBuilder {
    /// Creates new column chunk metadata builder.
    ///
    /// See also [`ColumnChunkMetaData::builder`]
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Default::default(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            geo_statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
            #[cfg(feature = "encryption")]
            encrypted_column_metadata: None,
        })
    }

    /// Sets list of encodings for this column chunk.
    ///
    /// The list is stored internally as an [`EncodingMask`].
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = EncodingMask::new_from_encodings(encodings.iter());
        self
    }

    /// Sets the encodings mask for this column chunk.
    pub fn set_encodings_mask(mut self, encodings: EncodingMask) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets optional file path for this column chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets number of values.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets compression.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets data page offset in bytes.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets optional dictionary page offset in bytes.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets optional index page offset in bytes.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets statistics for this column chunk.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Sets geospatial statistics for this column chunk.
    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
        self.0.geo_statistics = Some(value);
        self
    }

    /// Clears the statistics for this column chunk.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets page encoding stats for this column chunk.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(value);
        self
    }

    /// Clears the page encoding stats for this column chunk.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets optional bloom filter offset in bytes.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets optional bloom filter length in bytes.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets optional offset index offset in bytes.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets optional offset index length in bytes.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets optional column index offset in bytes.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets optional column index length in bytes.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets optional length of variable length data in bytes.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets optional repetition level histogram
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets optional definition level histogram
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the encryption metadata for an encrypted column
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value.map(Box::new);
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the raw encrypted column metadata bytes for an encrypted column
    pub fn set_encrypted_column_metadata(mut self, value: Option<Vec<u8>>) -> Self {
        self.0.encrypted_column_metadata = value;
        self
    }

    /// Builds column chunk metadata.
    ///
    /// Currently infallible: always returns `Ok`.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}
1358
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
    /// Physical type of the column; selects the concrete index type in [`Self::build`].
    column_type: Type,
    /// One entry per page: `true` if the page is entirely null.
    null_pages: Vec<bool>,
    /// Encoded min value bytes, one entry per page.
    min_values: Vec<Vec<u8>>,
    /// Encoded max value bytes, one entry per page.
    max_values: Vec<Vec<u8>>,
    /// Null count, one entry per page.
    null_counts: Vec<i64>,
    /// Ordering of min/max values across pages.
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any entry in the page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-nan values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}
1383
1384impl ColumnIndexBuilder {
1385    /// Creates a new column index builder.
1386    pub fn new(column_type: Type) -> Self {
1387        ColumnIndexBuilder {
1388            column_type,
1389            null_pages: Vec::new(),
1390            min_values: Vec::new(),
1391            max_values: Vec::new(),
1392            null_counts: Vec::new(),
1393            boundary_order: BoundaryOrder::UNORDERED,
1394            repetition_level_histograms: None,
1395            definition_level_histograms: None,
1396            valid: true,
1397        }
1398    }
1399
1400    /// Append statistics for the next page
1401    pub fn append(
1402        &mut self,
1403        null_page: bool,
1404        min_value: Vec<u8>,
1405        max_value: Vec<u8>,
1406        null_count: i64,
1407    ) {
1408        self.null_pages.push(null_page);
1409        self.min_values.push(min_value);
1410        self.max_values.push(max_value);
1411        self.null_counts.push(null_count);
1412    }
1413
1414    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
1415    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
1416    ///
1417    /// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
1418    pub fn append_histograms(
1419        &mut self,
1420        repetition_level_histogram: &Option<LevelHistogram>,
1421        definition_level_histogram: &Option<LevelHistogram>,
1422    ) {
1423        if !self.valid {
1424            return;
1425        }
1426        if let Some(rep_lvl_hist) = repetition_level_histogram {
1427            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1428            hist.reserve(rep_lvl_hist.len());
1429            hist.extend(rep_lvl_hist.values());
1430        }
1431        if let Some(def_lvl_hist) = definition_level_histogram {
1432            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1433            hist.reserve(def_lvl_hist.len());
1434            hist.extend(def_lvl_hist.values());
1435        }
1436    }
1437
1438    /// Set the boundary order of the column index
1439    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1440        self.boundary_order = boundary_order;
1441    }
1442
1443    /// Mark this column index as invalid
1444    pub fn to_invalid(&mut self) {
1445        self.valid = false;
1446    }
1447
1448    /// Is the information in the builder valid?
1449    pub fn valid(&self) -> bool {
1450        self.valid
1451    }
1452
1453    /// Build and get the column index
1454    ///
1455    /// Note: callers should check [`Self::valid`] before calling this method
1456    pub fn build(self) -> Result<ColumnIndexMetaData> {
1457        Ok(match self.column_type {
1458            Type::BOOLEAN => {
1459                let index = self.build_page_index()?;
1460                ColumnIndexMetaData::BOOLEAN(index)
1461            }
1462            Type::INT32 => {
1463                let index = self.build_page_index()?;
1464                ColumnIndexMetaData::INT32(index)
1465            }
1466            Type::INT64 => {
1467                let index = self.build_page_index()?;
1468                ColumnIndexMetaData::INT64(index)
1469            }
1470            Type::INT96 => {
1471                let index = self.build_page_index()?;
1472                ColumnIndexMetaData::INT96(index)
1473            }
1474            Type::FLOAT => {
1475                let index = self.build_page_index()?;
1476                ColumnIndexMetaData::FLOAT(index)
1477            }
1478            Type::DOUBLE => {
1479                let index = self.build_page_index()?;
1480                ColumnIndexMetaData::DOUBLE(index)
1481            }
1482            Type::BYTE_ARRAY => {
1483                let index = self.build_byte_array_index()?;
1484                ColumnIndexMetaData::BYTE_ARRAY(index)
1485            }
1486            Type::FIXED_LEN_BYTE_ARRAY => {
1487                let index = self.build_byte_array_index()?;
1488                ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index)
1489            }
1490        })
1491    }
1492
1493    fn build_page_index<T>(self) -> Result<PrimitiveColumnIndex<T>>
1494    where
1495        T: ParquetValueType,
1496    {
1497        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
1498        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();
1499
1500        PrimitiveColumnIndex::try_new(
1501            self.null_pages,
1502            self.boundary_order,
1503            Some(self.null_counts),
1504            self.repetition_level_histograms,
1505            self.definition_level_histograms,
1506            min_values,
1507            max_values,
1508        )
1509    }
1510
1511    fn build_byte_array_index(self) -> Result<ByteArrayColumnIndex> {
1512        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
1513        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();
1514
1515        ByteArrayColumnIndex::try_new(
1516            self.null_pages,
1517            self.boundary_order,
1518            Some(self.null_counts),
1519            self.repetition_level_histograms,
1520            self.definition_level_histograms,
1521            min_values,
1522            max_values,
1523        )
1524    }
1525}
1526
/// Creates a builder pre-populated with the given metadata; this is what
/// [`ColumnChunkMetaData::into_builder`] uses.
impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
    fn from(value: ColumnChunkMetaData) -> Self {
        ColumnChunkMetaDataBuilder(value)
    }
}
1532
/// Builder for offset index, part of the Parquet [PageIndex].
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
    /// Byte offset of each page.
    offset_array: Vec<i64>,
    /// Compressed size of each page.
    compressed_page_size_array: Vec<i32>,
    /// Index of the first row of each page.
    first_row_index_array: Vec<i64>,
    /// Per-page unencoded byte array data sizes; `None` until a value is appended.
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running row count, used to compute the next page's first row index.
    current_first_row_index: i64,
}
1543
impl Default for OffsetIndexBuilder {
    /// Equivalent to [`OffsetIndexBuilder::new`].
    fn default() -> Self {
        Self::new()
    }
}
1549
1550impl OffsetIndexBuilder {
1551    /// Creates a new offset index builder.
1552    pub fn new() -> Self {
1553        OffsetIndexBuilder {
1554            offset_array: Vec::new(),
1555            compressed_page_size_array: Vec::new(),
1556            first_row_index_array: Vec::new(),
1557            unencoded_byte_array_data_bytes_array: None,
1558            current_first_row_index: 0,
1559        }
1560    }
1561
1562    /// Append the row count of the next page.
1563    pub fn append_row_count(&mut self, row_count: i64) {
1564        let current_page_row_index = self.current_first_row_index;
1565        self.first_row_index_array.push(current_page_row_index);
1566        self.current_first_row_index += row_count;
1567    }
1568
1569    /// Append the offset and size of the next page.
1570    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1571        self.offset_array.push(offset);
1572        self.compressed_page_size_array.push(compressed_page_size);
1573    }
1574
1575    /// Append the unencoded byte array data bytes of the next page.
1576    pub fn append_unencoded_byte_array_data_bytes(
1577        &mut self,
1578        unencoded_byte_array_data_bytes: Option<i64>,
1579    ) {
1580        if let Some(val) = unencoded_byte_array_data_bytes {
1581            self.unencoded_byte_array_data_bytes_array
1582                .get_or_insert(Vec::new())
1583                .push(val);
1584        }
1585    }
1586
1587    /// Build and get the thrift metadata of offset index
1588    pub fn build(self) -> OffsetIndexMetaData {
1589        let locations = self
1590            .offset_array
1591            .iter()
1592            .zip(self.compressed_page_size_array.iter())
1593            .zip(self.first_row_index_array.iter())
1594            .map(|((offset, size), row_index)| PageLocation {
1595                offset: *offset,
1596                compressed_page_size: *size,
1597                first_row_index: *row_index,
1598            })
1599            .collect::<Vec<_>>();
1600        OffsetIndexMetaData {
1601            page_locations: locations,
1602            unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes_array,
1603        }
1604    }
1605}
1606
1607#[cfg(test)]
1608mod tests {
1609    use super::*;
1610    use crate::basic::{PageType, SortOrder};
1611    use crate::file::metadata::thrift::tests::{read_column_chunk, read_row_group};
1612
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        // One default-built column chunk per leaf column in the schema.
        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        // Serialize with the thrift compact protocol...
        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta.write_thrift(&mut writer).unwrap();

        // ...then read it back and verify the round-trip is lossless.
        let row_group_res = read_row_group(&mut buf, schema_descr).unwrap();

        assert_eq!(row_group_res, row_group_meta);
    }
1638
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        // Building with no column metadata must fail: the schema has 2 leaf
        // columns but the builder received 0.
        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }
1653
    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        // Schema with two INT32 leaf columns ("a" and "b").
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // Same schema plus an extra column "c" — this is the reader's
        // (mismatched) view of the file.
        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // Write a row group that contains only the two columns.
        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();
        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta_2cols.write_thrift(&mut writer).unwrap();

        // Reading it back with the 3-column schema must fail with a clear error.
        let err = read_row_group(&mut buf, schema_descr_3cols)
            .unwrap_err()
            .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }
1724
1725    #[test]
1726    fn test_column_chunk_metadata_thrift_conversion() {
1727        let column_descr = get_test_schema_descr().column(0);
1728        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1729            .set_encodings_mask(EncodingMask::new_from_encodings(
1730                [Encoding::PLAIN, Encoding::RLE].iter(),
1731            ))
1732            .set_file_path("file_path".to_owned())
1733            .set_num_values(1000)
1734            .set_compression(Compression::SNAPPY)
1735            .set_total_compressed_size(2000)
1736            .set_total_uncompressed_size(3000)
1737            .set_data_page_offset(4000)
1738            .set_dictionary_page_offset(Some(5000))
1739            .set_page_encoding_stats(vec![
1740                PageEncodingStats {
1741                    page_type: PageType::DATA_PAGE,
1742                    encoding: Encoding::PLAIN,
1743                    count: 3,
1744                },
1745                PageEncodingStats {
1746                    page_type: PageType::DATA_PAGE,
1747                    encoding: Encoding::RLE,
1748                    count: 5,
1749                },
1750            ])
1751            .set_bloom_filter_offset(Some(6000))
1752            .set_bloom_filter_length(Some(25))
1753            .set_offset_index_offset(Some(7000))
1754            .set_offset_index_length(Some(25))
1755            .set_column_index_offset(Some(8000))
1756            .set_column_index_length(Some(25))
1757            .set_unencoded_byte_array_data_bytes(Some(2000))
1758            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
1759            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
1760            .build()
1761            .unwrap();
1762
1763        let mut buf = Vec::new();
1764        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1765        col_metadata.write_thrift(&mut writer).unwrap();
1766        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();
1767
1768        assert_eq!(col_chunk_res, col_metadata);
1769    }
1770
1771    #[test]
1772    fn test_column_chunk_metadata_thrift_conversion_empty() {
1773        let column_descr = get_test_schema_descr().column(0);
1774
1775        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1776            .build()
1777            .unwrap();
1778
1779        let mut buf = Vec::new();
1780        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1781        col_metadata.write_thrift(&mut writer).unwrap();
1782        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();
1783
1784        assert_eq!(col_chunk_res, col_metadata);
1785    }
1786
1787    #[test]
1788    fn test_compressed_size() {
1789        let schema_descr = get_test_schema_descr();
1790
1791        let mut columns = vec![];
1792        for column_descr in schema_descr.columns() {
1793            let column = ColumnChunkMetaData::builder(column_descr.clone())
1794                .set_total_compressed_size(500)
1795                .set_total_uncompressed_size(700)
1796                .build()
1797                .unwrap();
1798            columns.push(column);
1799        }
1800        let row_group_meta = RowGroupMetaData::builder(schema_descr)
1801            .set_num_rows(1000)
1802            .set_column_metadata(columns)
1803            .build()
1804            .unwrap();
1805
1806        let compressed_size_res: i64 = row_group_meta.compressed_size();
1807        let compressed_size_exp: i64 = 1000;
1808
1809        assert_eq!(compressed_size_res, compressed_size_exp);
1810    }
1811
    /// Verifies `ParquetMetaData::memory_size()` against hard-coded byte
    /// counts, and that adding more metadata (exact statistics, column index,
    /// offset index) strictly increases the reported size.
    ///
    /// NOTE: the expected sizes are tied to the in-memory layout of the
    /// metadata structs; they must be updated whenever fields are added or
    /// resized (separate values per `encryption` feature for the same reason).
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        // Row group whose columns carry "empty" statistics (no min/max set).
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        // File-level metadata with heap-allocated pieces (created_by string,
        // key/value pairs, column orders) so they contribute to memory_size.
        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Now, add in Exact Statistics
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        // Baseline: file metadata + row group with exact statistics.
        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        // Expected sizes differ because the encryption feature adds fields
        // to the metadata structs.
        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2766;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2934;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Build a single-entry boolean column index to attach below.
        let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN);
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build().unwrap();
        let native_index = match column_index {
            ColumnIndexMetaData::BOOLEAN(index) => index,
            _ => panic!("wrong type of column index"),
        };

        // Now, add in OffsetIndex
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build();

        // Second build: same file metadata, plus column + offset indexes.
        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![offset_index]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 3192;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3360;

        // more set fields means more memory usage
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }
1922
    /// Verifies that attaching a `FileDecryptor` to `ParquetMetaData`
    /// increases `memory_size()` by the expected (hard-coded) amount.
    ///
    /// NOTE: as with `test_memory_size`, the expected byte counts are tied to
    /// struct layout and must be updated when fields change.
    #[test]
    #[cfg(feature = "encryption")]
    fn test_memory_size_with_decryptor() {
        use crate::encryption::decrypt::FileDecryptionProperties;
        use crate::file::metadata::thrift::encryption::AesGcmV1;

        let schema_descr = get_test_schema_descr();

        // Minimal row group: columns with only required fields set.
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build())
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        // File metadata carrying an AES-GCM-V1 encryption algorithm and
        // footer signing key metadata (arbitrary fixed byte patterns).
        let version = 2;
        let num_rows = 1000;
        let aad_file_unique = vec![1u8; 8];
        let aad_prefix = vec![2u8; 8];
        let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 {
            aad_prefix: Some(aad_prefix.clone()),
            aad_file_unique: Some(aad_file_unique.clone()),
            supply_aad_prefix: Some(true),
        });
        let footer_key_metadata = Some(vec![3u8; 8]);
        let file_metadata =
            FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None)
                .with_encryption_algorithm(Some(encryption_algorithm))
                .with_footer_signing_key_metadata(footer_key_metadata.clone());

        // Baseline size: no decryptor attached yet.
        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .build();

        let base_expected_size = 2058;
        assert_eq!(parquet_meta_data.memory_size(), base_expected_size);

        // Build decryption properties with a footer key plus one per-column
        // key for every column in the schema.
        let footer_key = "0123456789012345".as_bytes();
        let column_key = "1234567890123450".as_bytes();
        let mut decryption_properties_builder =
            FileDecryptionProperties::builder(footer_key.to_vec())
                .with_aad_prefix(aad_prefix.clone());
        for column in schema_descr.columns() {
            decryption_properties_builder = decryption_properties_builder
                .with_column_key(&column.path().string(), column_key.to_vec());
        }
        let decryption_properties = decryption_properties_builder.build().unwrap();
        let decryptor = FileDecryptor::new(
            &decryption_properties,
            footer_key_metadata.as_deref(),
            aad_file_unique,
            aad_prefix,
        )
        .unwrap();

        // Same metadata, now with the decryptor attached.
        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .set_file_decryptor(Some(decryptor))
            .build();

        // The decryptor (keys, AAD buffers) must add to the reported size.
        let expected_size_with_decryptor = 3072;
        assert!(expected_size_with_decryptor > base_expected_size);

        assert_eq!(
            parquet_meta_data.memory_size(),
            expected_size_with_decryptor
        );
    }
1997
1998    /// Returns sample schema descriptor so we can create column metadata.
1999    fn get_test_schema_descr() -> SchemaDescPtr {
2000        let schema = SchemaType::group_type_builder("schema")
2001            .with_fields(vec![
2002                Arc::new(
2003                    SchemaType::primitive_type_builder("a", Type::INT32)
2004                        .build()
2005                        .unwrap(),
2006                ),
2007                Arc::new(
2008                    SchemaType::primitive_type_builder("b", Type::INT32)
2009                        .build()
2010                        .unwrap(),
2011                ),
2012            ])
2013            .build()
2014            .unwrap();
2015
2016        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
2017    }
2018}