parquet/file/metadata/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet metadata API
19//!
20//! Most users should use these structures to interact with Parquet metadata.
21//! The [crate::format] module contains lower level structures generated from the
22//! Parquet thrift definition.
23//!
24//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
25//!   file footer.
26//!
27//! * [`FileMetaData`]: File level metadata such as schema, row counts and
28//!   version.
29//!
//! * [`RowGroupMetaData`]: Metadata for each Row Group within a file, such as
31//!   location and number of rows, and column chunks.
32//!
33//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
34//!   within a Row Group including encoding and compression information,
35//!   number of values, statistics, etc.
36//!
37//! # APIs for working with Parquet Metadata
38//!
39//! The Parquet readers and writers in this crate handle reading and writing
40//! metadata into parquet files. To work with metadata directly,
41//! the following APIs are available:
42//!
43//! * [`ParquetMetaDataReader`] for reading from a reader for I/O
44//! * [`ParquetMetaDataPushDecoder`] for decoding from bytes without I/O
45//! * [`ParquetMetaDataWriter`] for writing.
46//!
47//!
48//! # Examples
49//!
50//! Please see [`external_metadata.rs`]
51//!
52//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
53//!
54//! # Metadata Encodings and Structures
55//!
56//! There are three different encodings of Parquet Metadata in this crate:
57//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
59//!    [parquet.thrift]
60//!
61//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
62//!    from [parquet.thrift]. These structures are low level and mirror
63//!    the thrift definitions.
64//!
65//! 3. [`file::metadata`] (this module): Easier to use Rust structures
66//!    with a more idiomatic API. Note that, confusingly, some but not all
67//!    of these structures have the same name as the [`format`] structures.
68//!
69//! [`format`]: crate::format
70//! [`file::metadata`]: crate::file::metadata
71//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
72//!
73//! Graphically, this is how the different structures relate to each other:
74//!
75//! ```text
76//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
77//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
78//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
79//!                            └──────────────┘     │         └───────────────────────┘ │
80//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
81//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
82//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
83//!                                     ...         │                   ...             │
84//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
85//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
86//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
87//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
88//!
89//!                          format::meta structures          file::metadata structures
90//!
91//!                         * Same name, different struct
92//! ```
93mod memory;
94mod push_decoder;
95pub(crate) mod reader;
96mod writer;
97
98use crate::basic::{ColumnOrder, Compression, Encoding, Type};
99#[cfg(feature = "encryption")]
100use crate::encryption::{
101    decrypt::FileDecryptor,
102    modules::{create_module_aad, ModuleType},
103};
104use crate::errors::{ParquetError, Result};
105#[cfg(feature = "encryption")]
106use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData};
107pub(crate) use crate::file::metadata::memory::HeapSize;
108use crate::file::page_encoding_stats::{self, PageEncodingStats};
109use crate::file::page_index::index::Index;
110use crate::file::page_index::offset_index::OffsetIndexMetaData;
111use crate::file::statistics::{self, Statistics};
112use crate::format::ColumnCryptoMetaData as TColumnCryptoMetaData;
113use crate::format::{
114    BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup,
115    SizeStatistics, SortingColumn,
116};
117use crate::schema::types::{
118    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
119    Type as SchemaType,
120};
121#[cfg(feature = "encryption")]
122use crate::thrift::{TCompactSliceInputProtocol, TSerializable};
123pub use push_decoder::ParquetMetaDataPushDecoder;
124pub use reader::{FooterTail, PageIndexPolicy, ParquetMetaDataReader};
125use std::ops::Range;
126use std::sync::Arc;
127pub use writer::ParquetMetaDataWriter;
128pub(crate) use writer::ThriftMetadataWriter;
129
/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`Index`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`Index`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example `column_index[2][3]` holds the [`Index`] for the fourth
/// column in the third row group of the parquet file.
///
/// Note: the outer `Vec` is indexed by row group number and the inner `Vec`
/// by leaf column number (both zero-based).
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetColumnIndex = Vec<Vec<Index>>;
146
/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number` (both zero-based).
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
158
/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata
    file_metadata: FileMetaData,
    /// Row group metadata, one entry per row group in the file
    row_groups: Vec<RowGroupMetaData>,
    /// Page level index for each page in each column chunk.
    /// `None` until explicitly set (e.g. via [`Self::set_column_index`]).
    column_index: Option<ParquetColumnIndex>,
    /// Offset index for each page in each column chunk.
    /// `None` until explicitly set (e.g. via [`Self::set_offset_index`]).
    offset_index: Option<ParquetOffsetIndex>,
    /// Optional file decryptor, only present with the `encryption` feature
    #[cfg(feature = "encryption")]
    file_decryptor: Option<FileDecryptor>,
}
190
impl ParquetMetaData {
    /// Creates Parquet metadata from file metadata and a list of row
    /// group metadata.
    ///
    /// The column index, offset index (and file decryptor, when the
    /// `encryption` feature is enabled) are initialized to `None`.
    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
        ParquetMetaData {
            file_metadata,
            row_groups,
            #[cfg(feature = "encryption")]
            file_decryptor: None,
            column_index: None,
            offset_index: None,
        }
    }

    /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of
    /// encrypted data.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
        self.file_decryptor = file_decryptor;
    }

    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
    pub fn into_builder(self) -> ParquetMetaDataBuilder {
        self.into()
    }

    /// Returns file metadata as reference.
    pub fn file_metadata(&self) -> &FileMetaData {
        &self.file_metadata
    }

    /// Returns file decryptor as reference.
    #[cfg(feature = "encryption")]
    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
        self.file_decryptor.as_ref()
    }

    /// Returns number of row groups in this file.
    pub fn num_row_groups(&self) -> usize {
        self.row_groups.len()
    }

    /// Returns row group metadata for `i`th position.
    /// Position should be less than number of row groups `num_row_groups`.
    ///
    /// # Panics
    /// Panics if `i` is out of bounds (`i >= self.num_row_groups()`).
    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
        &self.row_groups[i]
    }

    /// Returns slice of row groups in this file.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.row_groups
    }

    /// Returns the column index for this file if loaded
    ///
    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
    /// [ArrowReaderOptions::with_page_index] was set to false.
    ///
    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.column_index.as_ref()
    }

    /// Returns offset indexes in this file, if loaded
    ///
    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
    /// [ArrowReaderOptions::with_page_index] was set to false.
    ///
    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.offset_index.as_ref()
    }

    /// Estimate of the bytes allocated to store `ParquetMetadata`
    ///
    /// # Notes:
    ///
    /// 1. Includes size of self
    ///
    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
    ///    [`RowGroupMetaData`].
    ///
    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
    ///    means `memory_size` will over estimate the memory size if such pointers
    ///    are shared.
    ///
    /// 4. Does not include any allocator overheads
    pub fn memory_size(&self) -> usize {
        // size_of covers the inline portion; heap_size (see memory.rs) adds
        // the heap allocations owned by each field
        std::mem::size_of::<Self>()
            + self.file_metadata.heap_size()
            + self.row_groups.heap_size()
            + self.column_index.heap_size()
            + self.offset_index.heap_size()
    }

    /// Override the column index
    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
        self.column_index = index;
    }

    /// Override the offset index
    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
        self.offset_index = index;
    }
}
296
/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// Call [`Self::build`] to produce the final [`ParquetMetaData`].
///
/// # Example creating a new [`ParquetMetaData`]
///
///```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
pub struct ParquetMetaDataBuilder(ParquetMetaData);
335
336impl ParquetMetaDataBuilder {
337    /// Create a new builder from a file metadata, with no row groups
338    pub fn new(file_meta_data: FileMetaData) -> Self {
339        Self(ParquetMetaData::new(file_meta_data, vec![]))
340    }
341
342    /// Create a new builder from an existing ParquetMetaData
343    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
344        Self(metadata)
345    }
346
347    /// Adds a row group to the metadata
348    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
349        self.0.row_groups.push(row_group);
350        self
351    }
352
353    /// Sets all the row groups to the specified list
354    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
355        self.0.row_groups = row_groups;
356        self
357    }
358
359    /// Takes ownership of the row groups in this builder, and clears the list
360    /// of row groups.
361    ///
362    /// This can be used for more efficient creation of a new ParquetMetaData
363    /// from an existing one.
364    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
365        std::mem::take(&mut self.0.row_groups)
366    }
367
368    /// Return a reference to the current row groups
369    pub fn row_groups(&self) -> &[RowGroupMetaData] {
370        &self.0.row_groups
371    }
372
373    /// Sets the column index
374    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
375        self.0.column_index = column_index;
376        self
377    }
378
379    /// Returns the current column index from the builder, replacing it with `None`
380    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
381        std::mem::take(&mut self.0.column_index)
382    }
383
384    /// Return a reference to the current column index, if any
385    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
386        self.0.column_index.as_ref()
387    }
388
389    /// Sets the offset index
390    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
391        self.0.offset_index = offset_index;
392        self
393    }
394
395    /// Returns the current offset index from the builder, replacing it with `None`
396    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
397        std::mem::take(&mut self.0.offset_index)
398    }
399
400    /// Return a reference to the current offset index, if any
401    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
402        self.0.offset_index.as_ref()
403    }
404
405    /// Creates a new ParquetMetaData from the builder
406    pub fn build(self) -> ParquetMetaData {
407        let Self(metadata) = self;
408        metadata
409    }
410}
411
412impl From<ParquetMetaData> for ParquetMetaDataBuilder {
413    fn from(meta_data: ParquetMetaData) -> Self {
414        Self(meta_data)
415    }
416}
417
/// A key-value pair for [`FileMetaData`].
///
/// Re-export of the thrift-generated [`crate::format::KeyValue`].
pub type KeyValue = crate::format::KeyValue;

/// Reference counted pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
423
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Format version of the file
    version: i32,
    /// Total number of rows across all row groups
    num_rows: i64,
    /// Writer identification string, if present
    created_by: Option<String>,
    /// Optional application-defined key/value pairs
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Shared descriptor of the file's schema
    schema_descr: SchemaDescPtr,
    /// Per-column sort orders for min/max statistics, if present
    column_orders: Option<Vec<ColumnOrder>>,
}
436
impl FileMetaData {
    /// Creates new file metadata.
    pub fn new(
        version: i32,
        num_rows: i64,
        created_by: Option<String>,
        key_value_metadata: Option<Vec<KeyValue>>,
        schema_descr: SchemaDescPtr,
        column_orders: Option<Vec<ColumnOrder>>,
    ) -> Self {
        FileMetaData {
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr,
            column_orders,
        }
    }

    /// Returns version of this file.
    pub fn version(&self) -> i32 {
        self.version
    }

    /// Returns number of rows in the file.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// String message for application that wrote this file.
    ///
    /// This should have the following format:
    /// `<application> version <application version> (build <application build hash>)`.
    ///
    /// ```shell
    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
    /// ```
    pub fn created_by(&self) -> Option<&str> {
        self.created_by.as_deref()
    }

    /// Returns key_value_metadata of this file.
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns Parquet [`Type`] that describes schema in this file.
    ///
    /// [`Type`]: crate::schema::types::Type
    pub fn schema(&self) -> &SchemaType {
        self.schema_descr.root_schema()
    }

    /// Returns a reference to schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        &self.schema_descr
    }

    /// Returns reference counted clone for schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Column (sort) order used for `min` and `max` values of each column in this file.
    ///
    /// Each column order corresponds to one column, determined by its position in the
    /// list, matching the position of the column in the schema.
    ///
    /// When `None` is returned, there are no column orders available, and each column
    /// should be assumed to have undefined (legacy) column order.
    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
        self.column_orders.as_ref()
    }

    /// Returns column order for `i`th column in this file.
    /// If column orders are not available, returns undefined (legacy) column order.
    ///
    /// # Panics
    /// Panics if column orders are present and `i` is out of bounds of the list.
    pub fn column_order(&self, i: usize) -> ColumnOrder {
        self.column_orders
            .as_ref()
            .map(|data| data[i])
            .unwrap_or(ColumnOrder::UNDEFINED)
    }
}
521
/// Reference counted pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk (one per schema leaf column)
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group
    num_rows: i64,
    /// Sort ordering of the rows, if declared by the writer
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total uncompressed byte size of all column data
    total_byte_size: i64,
    /// Shared descriptor of the file's schema
    schema_descr: SchemaDescPtr,
    /// We can't infer from file offset of first column since there may be empty columns in row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
}
541
impl RowGroupMetaData {
    /// Returns builder for row group metadata.
    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder::new(schema_descr)
    }

    /// Number of columns in this row group.
    pub fn num_columns(&self) -> usize {
        self.columns.len()
    }

    /// Returns column chunk metadata for `i`th column.
    ///
    /// # Panics
    /// Panics if `i` is out of bounds (`i >= self.num_columns()`).
    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
        &self.columns[i]
    }

    /// Returns slice of column chunk metadata.
    pub fn columns(&self) -> &[ColumnChunkMetaData] {
        &self.columns
    }

    /// Returns mutable slice of column chunk metadata.
    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
        &mut self.columns
    }

    /// Number of rows in this row group.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
    }

    /// Returns the sort ordering of the rows in this RowGroup if any
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Total byte size of all uncompressed column data in this row group.
    pub fn total_byte_size(&self) -> i64 {
        self.total_byte_size
    }

    /// Total size of all compressed column data in this row group.
    ///
    /// Computed by summing the compressed size of each column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.columns.iter().map(|c| c.total_compressed_size).sum()
    }

    /// Returns reference to a schema descriptor.
    pub fn schema_descr(&self) -> &SchemaDescriptor {
        self.schema_descr.as_ref()
    }

    /// Returns reference counted clone of schema descriptor.
    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
        self.schema_descr.clone()
    }

    /// Returns ordinal position of this row group in file.
    ///
    /// For example if this is the first row group in the file, this will return 0.
    /// If this is the second row group in the file, this will return 1.
    #[inline(always)]
    pub fn ordinal(&self) -> Option<i16> {
        self.ordinal
    }

    /// Returns file offset of this row group in file.
    #[inline(always)]
    pub fn file_offset(&self) -> Option<i64> {
        self.file_offset
    }

    /// Method to convert from encrypted Thrift.
    ///
    /// Like [`Self::from_thrift`], but additionally decrypts any column chunk
    /// whose metadata is encrypted, using the supplied [`FileDecryptor`].
    ///
    /// # Errors
    /// Returns an error if the schema column count does not match the row
    /// group's column count, if an encrypted column has no crypto metadata,
    /// or if decryption / thrift decoding of the column metadata fails.
    #[cfg(feature = "encryption")]
    fn from_encrypted_thrift(
        schema_descr: SchemaDescPtr,
        mut rg: RowGroup,
        decryptor: Option<&FileDecryptor>,
    ) -> Result<RowGroupMetaData> {
        if schema_descr.num_columns() != rg.columns.len() {
            return Err(general_err!(
                "Column count mismatch. Schema has {} columns while Row Group has {}",
                schema_descr.num_columns(),
                rg.columns.len()
            ));
        }
        let total_byte_size = rg.total_byte_size;
        let num_rows = rg.num_rows;
        let mut columns = vec![];

        for (i, (mut c, d)) in rg
            .columns
            .drain(0..)
            .zip(schema_descr.columns())
            .enumerate()
        {
            // Read encrypted metadata if it's present and we have a decryptor.
            if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) {
                // Select the decryptor for this column: either a column-specific
                // key or the footer key, depending on the crypto metadata.
                let column_decryptor = match c.crypto_metadata.as_ref() {
                    None => {
                        return Err(general_err!(
                            "No crypto_metadata is set for column '{}', which has encrypted metadata",
                            d.path().string()
                        ));
                    }
                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => {
                        let column_name = crypto_metadata.path_in_schema.join(".");
                        decryptor.get_column_metadata_decryptor(
                            column_name.as_str(),
                            crypto_metadata.key_metadata.as_deref(),
                        )?
                    }
                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => {
                        decryptor.get_footer_decryptor()?
                    }
                };

                // Build the AAD (additional authenticated data) for this
                // column's metadata module from the row group ordinal and
                // column index.
                // NOTE(review): `rg.ordinal.unwrap()` panics if the ordinal is
                // absent, and `as usize` would wrap for a negative i16 —
                // presumably writers always set a non-negative ordinal here;
                // verify against callers.
                let column_aad = create_module_aad(
                    decryptor.file_aad(),
                    ModuleType::ColumnMetaData,
                    rg.ordinal.unwrap() as usize,
                    i,
                    None,
                )?;

                let buf = c.encrypted_column_metadata.clone().unwrap();
                let decrypted_cc_buf = column_decryptor
                    .decrypt(buf.as_slice(), column_aad.as_ref())
                    .map_err(|_| {
                        general_err!(
                            "Unable to decrypt column '{}', perhaps the column key is wrong?",
                            d.path().string()
                        )
                    })?;

                // Replace the (absent/encrypted) metadata with the decrypted,
                // thrift-decoded ColumnMetaData before conversion below.
                let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice());
                c.meta_data = Some(ColumnMetaData::read_from_in_protocol(&mut prot)?);
            }
            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
        }

        let sorting_columns = rg.sorting_columns;
        Ok(RowGroupMetaData {
            columns,
            num_rows,
            sorting_columns,
            total_byte_size,
            schema_descr,
            file_offset: rg.file_offset,
            ordinal: rg.ordinal,
        })
    }

    /// Method to convert from Thrift.
    ///
    /// # Errors
    /// Returns an error if the schema column count does not match the row
    /// group's column count, or if any column chunk fails to convert.
    pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
        if schema_descr.num_columns() != rg.columns.len() {
            return Err(general_err!(
                "Column count mismatch. Schema has {} columns while Row Group has {}",
                schema_descr.num_columns(),
                rg.columns.len()
            ));
        }
        let total_byte_size = rg.total_byte_size;
        let num_rows = rg.num_rows;
        let mut columns = vec![];

        // Pair each thrift column chunk with its schema column descriptor,
        // in schema order.
        for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) {
            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
        }

        let sorting_columns = rg.sorting_columns;
        Ok(RowGroupMetaData {
            columns,
            num_rows,
            sorting_columns,
            total_byte_size,
            schema_descr,
            file_offset: rg.file_offset,
            ordinal: rg.ordinal,
        })
    }

    /// Method to convert to Thrift.
    pub fn to_thrift(&self) -> RowGroup {
        RowGroup {
            columns: self.columns().iter().map(|v| v.to_thrift()).collect(),
            total_byte_size: self.total_byte_size,
            num_rows: self.num_rows,
            sorting_columns: self.sorting_columns().cloned(),
            file_offset: self.file_offset(),
            // total_compressed_size is derived from the column chunks rather
            // than stored on this struct
            total_compressed_size: Some(self.compressed_size()),
            ordinal: self.ordinal,
        }
    }

    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
        RowGroupMetaDataBuilder(self)
    }
}
741
/// Builder for row group metadata.
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);

impl RowGroupMetaDataBuilder {
    /// Creates new builder from schema descriptor.
    ///
    /// All fields start at their defaults; the column list is pre-allocated
    /// for one entry per schema leaf column.
    fn new(schema_descr: SchemaDescPtr) -> Self {
        Self(RowGroupMetaData {
            columns: Vec::with_capacity(schema_descr.num_columns()),
            schema_descr,
            file_offset: None,
            num_rows: 0,
            sorting_columns: None,
            total_byte_size: 0,
            ordinal: None,
        })
    }

    /// Sets number of rows in this row group.
    pub fn set_num_rows(mut self, value: i64) -> Self {
        self.0.num_rows = value;
        self
    }

    /// Sets the sorting order for columns
    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
        self.0.sorting_columns = value;
        self
    }

    /// Sets total size in bytes for this row group.
    pub fn set_total_byte_size(mut self, value: i64) -> Self {
        self.0.total_byte_size = value;
        self
    }

    /// Takes ownership of the column metadata in this builder, and clears
    /// the list of columns.
    ///
    /// This can be used for more efficient creation of a new RowGroupMetaData
    /// from an existing one.
    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
        std::mem::take(&mut self.0.columns)
    }

    /// Sets column metadata for this row group.
    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
        self.0.columns = value;
        self
    }

    /// Adds a column metadata to this row group
    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
        self.0.columns.push(value);
        self
    }

    /// Sets ordinal for this row group.
    pub fn set_ordinal(mut self, value: i16) -> Self {
        self.0.ordinal = Some(value);
        self
    }

    /// Sets file offset for this row group.
    pub fn set_file_offset(mut self, value: i64) -> Self {
        self.0.file_offset = Some(value);
        self
    }

    /// Builds row group metadata.
    ///
    /// # Errors
    /// Returns an error if the number of column chunks added does not match
    /// the number of leaf columns in the schema descriptor.
    pub fn build(self) -> Result<RowGroupMetaData> {
        if self.0.schema_descr.num_columns() != self.0.columns.len() {
            return Err(general_err!(
                "Column length mismatch: {} != {}",
                self.0.schema_descr.num_columns(),
                self.0.columns.len()
            ));
        }

        Ok(self.0)
    }
}
823
/// Metadata for a column chunk.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor (path, physical type) of this leaf column
    column_descr: ColumnDescPtr,
    /// All encodings used in this column chunk
    encodings: Vec<Encoding>,
    /// File containing the chunk; `None` means the same file as the metadata
    file_path: Option<String>,
    /// Byte offset of `ColumnMetaData`; deprecated in the spec, modern writers use 0
    file_offset: i64,
    /// Total number of values in this column chunk
    num_values: i64,
    /// Compression codec used for this chunk
    compression: Compression,
    total_compressed_size: i64,
    total_uncompressed_size: i64,
    /// Byte offset of the first data page
    data_page_offset: i64,
    /// Byte offset of the index page, if present
    index_page_offset: Option<i64>,
    /// Byte offset of the dictionary page, if present
    dictionary_page_offset: Option<i64>,
    /// Statistics for this chunk, if recorded
    statistics: Option<Statistics>,
    /// Per-page encoding statistics, if recorded
    encoding_stats: Option<Vec<PageEncodingStats>>,
    /// Bloom filter location/length in the file, if present
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    /// Offset index location/length in the file, if present
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    /// Column index location/length in the file, if present
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    /// Decoded size of variable length data; only set for BYTE_ARRAY columns
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Per-level value counts, if recorded (see [`LevelHistogram`])
    repetition_level_histogram: Option<LevelHistogram>,
    definition_level_histogram: Option<LevelHistogram>,
    /// Encryption metadata, only with the `encryption` feature
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<ColumnCryptoMetaData>,
}
852
/// Histograms for repetition and definition levels.
///
/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of
/// values at level `i`.
///
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the
/// number of rows with level 1, and so on.
///
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    /// Bucket `i` counts the number of values observed at level `i`.
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a new level histogram data.
    ///
    /// Length will be `max_level + 1`.
    ///
    /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case)
    pub fn try_new(max_level: i16) -> Option<Self> {
        if max_level > 0 {
            Some(Self {
                inner: vec![0; max_level as usize + 1],
            })
        } else {
            None
        }
    }
    /// Returns a reference to the histogram's values.
    pub fn values(&self) -> &[i64] {
        &self.inner
    }

    /// Return the inner vector, consuming self
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the histogram value at the given index.
    ///
    /// The value of `i` is the number of values with level `i`. For example,
    /// `get(1)` returns the number of values with level 1.
    ///
    /// Returns `None` if the index is out of bounds.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Adds the values from the other histogram to this histogram
    ///
    /// # Panics
    /// If the histograms have different lengths
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
            *dst += src;
        }
    }

    /// return the length of the histogram
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// returns if the histogram is empty
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Sets the values of all histogram levels to 0.
    pub fn reset(&mut self) {
        // `fill` is the idiomatic way to zero a slice (and optimizes to memset).
        self.inner.fill(0);
    }

    /// Updates histogram values using provided repetition levels
    ///
    /// # Panics
    /// if any of the levels is greater than `max_level` (the argument supplied
    /// to [`Self::try_new`]), i.e. greater than or equal to the histogram length
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &level in levels {
            self.inner[level as usize] += 1;
        }
    }
}

/// Creates a histogram directly from a vector of per-level counts.
impl From<Vec<i64>> for LevelHistogram {
    fn from(inner: Vec<i64>) -> Self {
        Self { inner }
    }
}

/// Extracts the per-level counts, consuming the histogram.
impl From<LevelHistogram> for Vec<i64> {
    fn from(value: LevelHistogram) -> Self {
        value.into_inner()
    }
}
952
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        // Delegates to the inner `Vec`'s heap size.
        self.inner.heap_size()
    }
}
958
959/// Represents common operations for a column chunk.
960impl ColumnChunkMetaData {
961    /// Returns builder for column chunk metadata.
962    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
963        ColumnChunkMetaDataBuilder::new(column_descr)
964    }
965
966    /// File where the column chunk is stored.
967    ///
968    /// If not set, assumed to belong to the same file as the metadata.
969    /// This path is relative to the current file.
970    pub fn file_path(&self) -> Option<&str> {
971        self.file_path.as_deref()
972    }
973
974    /// Byte offset of `ColumnMetaData` in `file_path()`.
975    ///
976    /// Note that the meaning of this field has been inconsistent between implementations
977    /// so its use has since been deprecated in the Parquet specification. Modern implementations
978    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
979    /// `ColumnChunk` struct.
980    pub fn file_offset(&self) -> i64 {
981        self.file_offset
982    }
983
984    /// Type of this column. Must be primitive.
985    pub fn column_type(&self) -> Type {
986        self.column_descr.physical_type()
987    }
988
989    /// Path (or identifier) of this column.
990    pub fn column_path(&self) -> &ColumnPath {
991        self.column_descr.path()
992    }
993
994    /// Descriptor for this column.
995    pub fn column_descr(&self) -> &ColumnDescriptor {
996        self.column_descr.as_ref()
997    }
998
999    /// Reference counted clone of descriptor for this column.
1000    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
1001        self.column_descr.clone()
1002    }
1003
1004    /// All encodings used for this column.
1005    pub fn encodings(&self) -> &Vec<Encoding> {
1006        &self.encodings
1007    }
1008
1009    /// Total number of values in this column chunk.
1010    pub fn num_values(&self) -> i64 {
1011        self.num_values
1012    }
1013
1014    /// Compression for this column.
1015    pub fn compression(&self) -> Compression {
1016        self.compression
1017    }
1018
1019    /// Returns the total compressed data size of this column chunk.
1020    pub fn compressed_size(&self) -> i64 {
1021        self.total_compressed_size
1022    }
1023
1024    /// Returns the total uncompressed data size of this column chunk.
1025    pub fn uncompressed_size(&self) -> i64 {
1026        self.total_uncompressed_size
1027    }
1028
1029    /// Returns the offset for the column data.
1030    pub fn data_page_offset(&self) -> i64 {
1031        self.data_page_offset
1032    }
1033
1034    /// Returns the offset for the index page.
1035    pub fn index_page_offset(&self) -> Option<i64> {
1036        self.index_page_offset
1037    }
1038
1039    /// Returns the offset for the dictionary page, if any.
1040    pub fn dictionary_page_offset(&self) -> Option<i64> {
1041        self.dictionary_page_offset
1042    }
1043
1044    /// Returns the offset and length in bytes of the column chunk within the file
1045    pub fn byte_range(&self) -> (u64, u64) {
1046        let col_start = match self.dictionary_page_offset() {
1047            Some(dictionary_page_offset) => dictionary_page_offset,
1048            None => self.data_page_offset(),
1049        };
1050        let col_len = self.compressed_size();
1051        assert!(
1052            col_start >= 0 && col_len >= 0,
1053            "column start and length should not be negative"
1054        );
1055        (col_start as u64, col_len as u64)
1056    }
1057
1058    /// Returns statistics that are set for this column chunk,
1059    /// or `None` if no statistics are available.
1060    pub fn statistics(&self) -> Option<&Statistics> {
1061        self.statistics.as_ref()
1062    }
1063
1064    /// Returns the offset for the page encoding stats,
1065    /// or `None` if no page encoding stats are available.
1066    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
1067        self.encoding_stats.as_ref()
1068    }
1069
1070    /// Returns the offset for the bloom filter.
1071    pub fn bloom_filter_offset(&self) -> Option<i64> {
1072        self.bloom_filter_offset
1073    }
1074
1075    /// Returns the offset for the bloom filter.
1076    pub fn bloom_filter_length(&self) -> Option<i32> {
1077        self.bloom_filter_length
1078    }
1079
1080    /// Returns the offset for the column index.
1081    pub fn column_index_offset(&self) -> Option<i64> {
1082        self.column_index_offset
1083    }
1084
1085    /// Returns the offset for the column index length.
1086    pub fn column_index_length(&self) -> Option<i32> {
1087        self.column_index_length
1088    }
1089
1090    /// Returns the range for the offset index if any
1091    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
1092        let offset = u64::try_from(self.column_index_offset?).ok()?;
1093        let length = u64::try_from(self.column_index_length?).ok()?;
1094        Some(offset..(offset + length))
1095    }
1096
1097    /// Returns the offset for the offset index.
1098    pub fn offset_index_offset(&self) -> Option<i64> {
1099        self.offset_index_offset
1100    }
1101
1102    /// Returns the offset for the offset index length.
1103    pub fn offset_index_length(&self) -> Option<i32> {
1104        self.offset_index_length
1105    }
1106
1107    /// Returns the range for the offset index if any
1108    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
1109        let offset = u64::try_from(self.offset_index_offset?).ok()?;
1110        let length = u64::try_from(self.offset_index_length?).ok()?;
1111        Some(offset..(offset + length))
1112    }
1113
1114    /// Returns the number of bytes of variable length data after decoding.
1115    ///
1116    /// Only set for BYTE_ARRAY columns. This field may not be set by older
1117    /// writers.
1118    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
1119        self.unencoded_byte_array_data_bytes
1120    }
1121
1122    /// Returns the repetition level histogram.
1123    ///
1124    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
1125    /// `vec[0]` indicates how many rows the page contains.
1126    /// This field may not be set by older writers.
1127    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
1128        self.repetition_level_histogram.as_ref()
1129    }
1130
1131    /// Returns the definition level histogram.
1132    ///
1133    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
1134    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
1135    /// This field may not be set by older writers.
1136    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
1137        self.definition_level_histogram.as_ref()
1138    }
1139
1140    /// Returns the encryption metadata for this column chunk.
1141    #[cfg(feature = "encryption")]
1142    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
1143        self.column_crypto_metadata.as_ref()
1144    }
1145
    /// Method to convert from Thrift.
    ///
    /// Builds a [`ColumnChunkMetaData`] from the thrift-generated [`ColumnChunk`],
    /// decoding enum values and unpacking optional size statistics (and, with
    /// the `encryption` feature, column crypto metadata).
    ///
    /// # Errors
    /// Returns an error if `cc.meta_data` is absent, or if any enum value or
    /// nested structure fails to decode.
    pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
        // The embedded ColumnMetaData is required; bail out early if missing.
        if cc.meta_data.is_none() {
            return Err(general_err!("Expected to have column metadata"));
        }
        let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap();
        // Decode thrift enum representations into this crate's types.
        let column_type = Type::try_from(col_metadata.type_)?;
        let encodings = col_metadata
            .encodings
            .drain(0..)
            .map(Encoding::try_from)
            .collect::<Result<_>>()?;
        let compression = Compression::try_from(col_metadata.codec)?;
        let file_path = cc.file_path;
        let file_offset = cc.file_offset;
        let num_values = col_metadata.num_values;
        let total_compressed_size = col_metadata.total_compressed_size;
        let total_uncompressed_size = col_metadata.total_uncompressed_size;
        let data_page_offset = col_metadata.data_page_offset;
        let index_page_offset = col_metadata.index_page_offset;
        let dictionary_page_offset = col_metadata.dictionary_page_offset;
        // Statistics decoding depends on the column's physical type.
        let statistics = statistics::from_thrift(column_type, col_metadata.statistics)?;
        let encoding_stats = col_metadata
            .encoding_stats
            .as_ref()
            .map(|vec| {
                vec.iter()
                    .map(page_encoding_stats::try_from_thrift)
                    .collect::<Result<_>>()
            })
            .transpose()?;
        let bloom_filter_offset = col_metadata.bloom_filter_offset;
        let bloom_filter_length = col_metadata.bloom_filter_length;
        let offset_index_offset = cc.offset_index_offset;
        let offset_index_length = cc.offset_index_length;
        let column_index_offset = cc.column_index_offset;
        let column_index_length = cc.column_index_length;
        // Size statistics are optional; when absent all three components are unknown.
        let (
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
        ) = if let Some(size_stats) = col_metadata.size_statistics {
            (
                size_stats.unencoded_byte_array_data_bytes,
                size_stats.repetition_level_histogram,
                size_stats.definition_level_histogram,
            )
        } else {
            (None, None, None)
        };

        // Wrap the raw Vec<i64> histograms in the typed LevelHistogram.
        let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from);
        let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);

        #[cfg(feature = "encryption")]
        let column_crypto_metadata = if let Some(crypto_metadata) = cc.crypto_metadata {
            Some(column_crypto_metadata::try_from_thrift(&crypto_metadata)?)
        } else {
            None
        };

        let result = ColumnChunkMetaData {
            column_descr,
            encodings,
            file_path,
            file_offset,
            num_values,
            compression,
            total_compressed_size,
            total_uncompressed_size,
            data_page_offset,
            index_page_offset,
            dictionary_page_offset,
            statistics,
            encoding_stats,
            bloom_filter_offset,
            bloom_filter_length,
            offset_index_offset,
            offset_index_length,
            column_index_offset,
            column_index_length,
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
            #[cfg(feature = "encryption")]
            column_crypto_metadata,
        };
        Ok(result)
    }
1235
1236    /// Method to convert to Thrift.
1237    pub fn to_thrift(&self) -> ColumnChunk {
1238        let column_metadata = self.to_column_metadata_thrift();
1239
1240        ColumnChunk {
1241            file_path: self.file_path().map(|s| s.to_owned()),
1242            file_offset: self.file_offset,
1243            meta_data: Some(column_metadata),
1244            offset_index_offset: self.offset_index_offset,
1245            offset_index_length: self.offset_index_length,
1246            column_index_offset: self.column_index_offset,
1247            column_index_length: self.column_index_length,
1248            crypto_metadata: self.column_crypto_metadata_thrift(),
1249            encrypted_column_metadata: None,
1250        }
1251    }
1252
1253    /// Method to convert to Thrift `ColumnMetaData`
1254    pub fn to_column_metadata_thrift(&self) -> ColumnMetaData {
1255        let size_statistics = if self.unencoded_byte_array_data_bytes.is_some()
1256            || self.repetition_level_histogram.is_some()
1257            || self.definition_level_histogram.is_some()
1258        {
1259            let repetition_level_histogram = self
1260                .repetition_level_histogram
1261                .as_ref()
1262                .map(|hist| hist.clone().into_inner());
1263
1264            let definition_level_histogram = self
1265                .definition_level_histogram
1266                .as_ref()
1267                .map(|hist| hist.clone().into_inner());
1268
1269            Some(SizeStatistics {
1270                unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes,
1271                repetition_level_histogram,
1272                definition_level_histogram,
1273            })
1274        } else {
1275            None
1276        };
1277
1278        ColumnMetaData {
1279            type_: self.column_type().into(),
1280            encodings: self.encodings().iter().map(|&v| v.into()).collect(),
1281            path_in_schema: self.column_path().as_ref().to_vec(),
1282            codec: self.compression.into(),
1283            num_values: self.num_values,
1284            total_uncompressed_size: self.total_uncompressed_size,
1285            total_compressed_size: self.total_compressed_size,
1286            key_value_metadata: None,
1287            data_page_offset: self.data_page_offset,
1288            index_page_offset: self.index_page_offset,
1289            dictionary_page_offset: self.dictionary_page_offset,
1290            statistics: statistics::to_thrift(self.statistics.as_ref()),
1291            encoding_stats: self
1292                .encoding_stats
1293                .as_ref()
1294                .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()),
1295            bloom_filter_offset: self.bloom_filter_offset,
1296            bloom_filter_length: self.bloom_filter_length,
1297            size_statistics,
1298            geospatial_statistics: None,
1299        }
1300    }
1301
1302    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
1303    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
1304        ColumnChunkMetaDataBuilder::from(self)
1305    }
1306
    /// Converts the column crypto metadata to its thrift representation, if present.
    #[cfg(feature = "encryption")]
    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
        self.column_crypto_metadata
            .as_ref()
            .map(column_crypto_metadata::to_thrift)
    }

    /// Without the `encryption` feature there is never crypto metadata to serialize.
    #[cfg(not(feature = "encryption"))]
    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
        None
    }
1318}
1319
/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one.
///
/// Obtain one via [`ColumnChunkMetaData::builder`] or via
/// `ColumnChunkMetaDataBuilder::from(existing_metadata)`.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1339
1340impl ColumnChunkMetaDataBuilder {
    /// Creates new column chunk metadata builder.
    ///
    /// See also [`ColumnChunkMetaData::builder`]
    fn new(column_descr: ColumnDescPtr) -> Self {
        // Start from an all-default chunk: zero sizes/offsets, no compression,
        // and every optional field unset.
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Vec::new(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
        })
    }
1372
1373    /// Sets list of encodings for this column chunk.
1374    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
1375        self.0.encodings = encodings;
1376        self
1377    }
1378
1379    /// Sets optional file path for this column chunk.
1380    pub fn set_file_path(mut self, value: String) -> Self {
1381        self.0.file_path = Some(value);
1382        self
1383    }
1384
1385    /// Sets number of values.
1386    pub fn set_num_values(mut self, value: i64) -> Self {
1387        self.0.num_values = value;
1388        self
1389    }
1390
1391    /// Sets compression.
1392    pub fn set_compression(mut self, value: Compression) -> Self {
1393        self.0.compression = value;
1394        self
1395    }
1396
1397    /// Sets total compressed size in bytes.
1398    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
1399        self.0.total_compressed_size = value;
1400        self
1401    }
1402
1403    /// Sets total uncompressed size in bytes.
1404    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
1405        self.0.total_uncompressed_size = value;
1406        self
1407    }
1408
1409    /// Sets data page offset in bytes.
1410    pub fn set_data_page_offset(mut self, value: i64) -> Self {
1411        self.0.data_page_offset = value;
1412        self
1413    }
1414
1415    /// Sets optional dictionary page offset in bytes.
1416    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
1417        self.0.dictionary_page_offset = value;
1418        self
1419    }
1420
1421    /// Sets optional index page offset in bytes.
1422    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
1423        self.0.index_page_offset = value;
1424        self
1425    }
1426
1427    /// Sets statistics for this column chunk.
1428    pub fn set_statistics(mut self, value: Statistics) -> Self {
1429        self.0.statistics = Some(value);
1430        self
1431    }
1432
1433    /// Clears the statistics for this column chunk.
1434    pub fn clear_statistics(mut self) -> Self {
1435        self.0.statistics = None;
1436        self
1437    }
1438
1439    /// Sets page encoding stats for this column chunk.
1440    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
1441        self.0.encoding_stats = Some(value);
1442        self
1443    }
1444
1445    /// Clears the page encoding stats for this column chunk.
1446    pub fn clear_page_encoding_stats(mut self) -> Self {
1447        self.0.encoding_stats = None;
1448        self
1449    }
1450
1451    /// Sets optional bloom filter offset in bytes.
1452    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
1453        self.0.bloom_filter_offset = value;
1454        self
1455    }
1456
1457    /// Sets optional bloom filter length in bytes.
1458    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
1459        self.0.bloom_filter_length = value;
1460        self
1461    }
1462
1463    /// Sets optional offset index offset in bytes.
1464    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
1465        self.0.offset_index_offset = value;
1466        self
1467    }
1468
1469    /// Sets optional offset index length in bytes.
1470    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
1471        self.0.offset_index_length = value;
1472        self
1473    }
1474
1475    /// Sets optional column index offset in bytes.
1476    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
1477        self.0.column_index_offset = value;
1478        self
1479    }
1480
1481    /// Sets optional column index length in bytes.
1482    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
1483        self.0.column_index_length = value;
1484        self
1485    }
1486
1487    /// Sets optional length of variable length data in bytes.
1488    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
1489        self.0.unencoded_byte_array_data_bytes = value;
1490        self
1491    }
1492
1493    /// Sets optional repetition level histogram
1494    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1495        self.0.repetition_level_histogram = value;
1496        self
1497    }
1498
1499    /// Sets optional repetition level histogram
1500    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1501        self.0.definition_level_histogram = value;
1502        self
1503    }
1504
1505    #[cfg(feature = "encryption")]
1506    /// Set the encryption metadata for an encrypted column
1507    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
1508        self.0.column_crypto_metadata = value;
1509        self
1510    }
1511
1512    /// Builds column chunk metadata.
1513    pub fn build(self) -> Result<ColumnChunkMetaData> {
1514        Ok(self.0)
1515    }
1516}
1517
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
    /// `true` for each appended page that is entirely null
    null_pages: Vec<bool>,
    /// per-page encoded min values
    min_values: Vec<Vec<u8>>,
    /// per-page encoded max values
    max_values: Vec<Vec<u8>>,
    /// per-page null counts
    null_counts: Vec<i64>,
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any entry in the page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-nan values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}
1540
1541impl Default for ColumnIndexBuilder {
1542    fn default() -> Self {
1543        Self::new()
1544    }
1545}
1546
1547impl ColumnIndexBuilder {
1548    /// Creates a new column index builder.
1549    pub fn new() -> Self {
1550        ColumnIndexBuilder {
1551            null_pages: Vec::new(),
1552            min_values: Vec::new(),
1553            max_values: Vec::new(),
1554            null_counts: Vec::new(),
1555            boundary_order: BoundaryOrder::UNORDERED,
1556            repetition_level_histograms: None,
1557            definition_level_histograms: None,
1558            valid: true,
1559        }
1560    }
1561
1562    /// Append statistics for the next page
1563    pub fn append(
1564        &mut self,
1565        null_page: bool,
1566        min_value: Vec<u8>,
1567        max_value: Vec<u8>,
1568        null_count: i64,
1569    ) {
1570        self.null_pages.push(null_page);
1571        self.min_values.push(min_value);
1572        self.max_values.push(max_value);
1573        self.null_counts.push(null_count);
1574    }
1575
1576    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
1577    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
1578    pub fn append_histograms(
1579        &mut self,
1580        repetition_level_histogram: &Option<LevelHistogram>,
1581        definition_level_histogram: &Option<LevelHistogram>,
1582    ) {
1583        if !self.valid {
1584            return;
1585        }
1586        if let Some(ref rep_lvl_hist) = repetition_level_histogram {
1587            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1588            hist.reserve(rep_lvl_hist.len());
1589            hist.extend(rep_lvl_hist.values());
1590        }
1591        if let Some(ref def_lvl_hist) = definition_level_histogram {
1592            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1593            hist.reserve(def_lvl_hist.len());
1594            hist.extend(def_lvl_hist.values());
1595        }
1596    }
1597
1598    /// Set the boundary order of the column index
1599    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1600        self.boundary_order = boundary_order;
1601    }
1602
1603    /// Mark this column index as invalid
1604    pub fn to_invalid(&mut self) {
1605        self.valid = false;
1606    }
1607
1608    /// Is the information in the builder valid?
1609    pub fn valid(&self) -> bool {
1610        self.valid
1611    }
1612
    /// Build and get the thrift metadata of column index
    ///
    /// Note: callers should check [`Self::valid`] before calling this method
    pub fn build_to_thrift(self) -> ColumnIndex {
        // Argument order must match the thrift-generated `ColumnIndex::new` signature.
        ColumnIndex::new(
            self.null_pages,
            self.min_values,
            self.max_values,
            self.boundary_order,
            self.null_counts,
            self.repetition_level_histograms,
            self.definition_level_histograms,
        )
    }
1627}
1628
1629impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1630    fn from(value: ColumnChunkMetaData) -> Self {
1631        ColumnChunkMetaDataBuilder(value)
1632    }
1633}
1634
/// Builder for offset index, part of the Parquet [PageIndex].
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
    /// byte offset of each page
    offset_array: Vec<i64>,
    /// compressed size of each page
    compressed_page_size_array: Vec<i32>,
    /// index of the first row of each page
    first_row_index_array: Vec<i64>,
    /// optional per-page unencoded byte counts (variable length data)
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// running row total; the first row index of the next page to be appended
    current_first_row_index: i64,
}
1645
1646impl Default for OffsetIndexBuilder {
1647    fn default() -> Self {
1648        Self::new()
1649    }
1650}
1651
1652impl OffsetIndexBuilder {
1653    /// Creates a new offset index builder.
1654    pub fn new() -> Self {
1655        OffsetIndexBuilder {
1656            offset_array: Vec::new(),
1657            compressed_page_size_array: Vec::new(),
1658            first_row_index_array: Vec::new(),
1659            unencoded_byte_array_data_bytes_array: None,
1660            current_first_row_index: 0,
1661        }
1662    }
1663
1664    /// Append the row count of the next page.
1665    pub fn append_row_count(&mut self, row_count: i64) {
1666        let current_page_row_index = self.current_first_row_index;
1667        self.first_row_index_array.push(current_page_row_index);
1668        self.current_first_row_index += row_count;
1669    }
1670
1671    /// Append the offset and size of the next page.
1672    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1673        self.offset_array.push(offset);
1674        self.compressed_page_size_array.push(compressed_page_size);
1675    }
1676
1677    /// Append the unencoded byte array data bytes of the next page.
1678    pub fn append_unencoded_byte_array_data_bytes(
1679        &mut self,
1680        unencoded_byte_array_data_bytes: Option<i64>,
1681    ) {
1682        if let Some(val) = unencoded_byte_array_data_bytes {
1683            self.unencoded_byte_array_data_bytes_array
1684                .get_or_insert(Vec::new())
1685                .push(val);
1686        }
1687    }
1688
1689    /// Build and get the thrift metadata of offset index
1690    pub fn build_to_thrift(self) -> OffsetIndex {
1691        let locations = self
1692            .offset_array
1693            .iter()
1694            .zip(self.compressed_page_size_array.iter())
1695            .zip(self.first_row_index_array.iter())
1696            .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index))
1697            .collect::<Vec<_>>();
1698        OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array)
1699    }
1700}
1701
#[cfg(test)]
mod tests {
    use super::*;
    use crate::basic::{PageType, SortOrder};
    use crate::file::page_index::index::NativeIndex;

    /// Round-trips `RowGroupMetaData` through its thrift representation and
    /// checks the re-decoded form serializes to identical thrift.
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        let row_group_exp = row_group_meta.to_thrift();
        let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(row_group_res, row_group_exp);
    }

    /// Building a row group without column metadata must fail: the test
    /// schema declares 2 columns, but none are supplied.
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }

    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        // Row group metadata built against a 2-column schema...
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // ...is decoded against this 3-column schema to simulate corruption.
        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();

        // Decoding must surface the column-count mismatch as an error.
        let err =
            RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift())
                .unwrap_err()
                .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }

    /// Round-trips a fully-populated `ColumnChunkMetaData` (every optional
    /// field set) through thrift and back.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        let col_chunk_res =
            ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    /// Round-trips a minimal `ColumnChunkMetaData` (no optional fields set)
    /// through thrift.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_empty() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .build()
            .unwrap();

        let col_chunk_exp = col_metadata.to_thrift();
        let col_chunk_res = ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(col_chunk_res, col_chunk_exp);
    }

    /// `RowGroupMetaData::compressed_size` should sum the per-column
    /// compressed sizes (2 columns x 500 bytes = 1000).
    #[test]
    fn test_compressed_size() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for column_descr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(column_descr.clone())
                .set_total_compressed_size(500)
                .set_total_uncompressed_size(700)
                .build()
                .unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();

        let compressed_size_res: i64 = row_group_meta.compressed_size();
        let compressed_size_exp: i64 = 1000;

        assert_eq!(compressed_size_res, compressed_size_exp);
    }

    /// Checks `ParquetMetaData::memory_size` against hard-coded expected
    /// values, and that adding more metadata (exact statistics, column/offset
    /// indexes) strictly increases the reported size.
    ///
    /// NOTE: the expected byte counts depend on the in-memory layout of the
    /// metadata structs, so they differ when the `encryption` feature adds
    /// fields, and must be updated whenever those structs change.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        // Baseline: columns whose statistics carry no min/max values.
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Now, add in Exact Statistics
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2312;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2648;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Add a column index with a single boolean page.
        let mut column_index = ColumnIndexBuilder::new();
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build_to_thrift();
        let native_index = NativeIndex::<bool>::try_new(column_index).unwrap();

        // Now, add in OffsetIndex
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build_to_thrift();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![
                OffsetIndexMetaData::try_new(offset_index).unwrap()
            ]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 2816;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3152;

        // more set fields means more memory usage
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }

    /// Returns sample schema descriptor so we can create column metadata.
    fn get_test_schema_descr() -> SchemaDescPtr {
        let schema = SchemaType::group_type_builder("schema")
            .with_fields(vec![
                Arc::new(
                    SchemaType::primitive_type_builder("a", Type::INT32)
                        .build()
                        .unwrap(),
                ),
                Arc::new(
                    SchemaType::primitive_type_builder("b", Type::INT32)
                        .build()
                        .unwrap(),
                ),
            ])
            .build()
            .unwrap();

        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
    }
}