parquet/file/metadata/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet metadata API
19//!
20//! Most users should use these structures to interact with Parquet metadata.
21//! The [crate::format] module contains lower level structures generated from the
22//! Parquet thrift definition.
23//!
24//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
25//!   file footer.
26//!
27//! * [`FileMetaData`]: File level metadata such as schema, row counts and
28//!   version.
29//!
30//! * [`RowGroupMetaData`]: Metadata for each Row Group with a File, such as
31//!   location and number of rows, and column chunks.
32//!
33//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
34//!   within a Row Group including encoding and compression information,
35//!   number of values, statistics, etc.
36//!
37//! # APIs for working with Parquet Metadata
38//!
39//! The Parquet readers and writers in this crate handle reading and writing
40//! metadata into parquet files. To work with metadata directly,
41//! the following APIs are available:
42//!
43//! * [`ParquetMetaDataReader`] for reading metadata from an I/O source (sync and async)
44//! * [`ParquetMetaDataPushDecoder`] for decoding from bytes without I/O
45//! * [`ParquetMetaDataWriter`] for writing.
46//!
47//!
48//! # Examples
49//!
50//! Please see [`external_metadata.rs`]
51//!
52//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
53//!
54//! # Metadata Encodings and Structures
55//!
56//! There are three different encodings of Parquet Metadata in this crate:
57//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
59//!    [parquet.thrift]
60//!
61//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
62//!    from [parquet.thrift]. These structures are low level and mirror
63//!    the thrift definitions.
64//!
65//! 3. [`file::metadata`] (this module): Easier to use Rust structures
66//!    with a more idiomatic API. Note that, confusingly, some but not all
67//!    of these structures have the same name as the [`format`] structures.
68//!
69//! [`format`]: crate::format
70//! [`file::metadata`]: crate::file::metadata
71//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
72//!
73//! Graphically, this is how the different structures relate to each other:
74//!
75//! ```text
76//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
77//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
78//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
79//!                            └──────────────┘     │         └───────────────────────┘ │
80//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
81//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
82//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
83//!                                     ...         │                   ...             │
84//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
85//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
86//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
87//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
88//!
89//!                          format::meta structures          file::metadata structures
90//!
91//!                         * Same name, different struct
92//! ```
93mod footer_tail;
94mod memory;
95mod parser;
96mod push_decoder;
97pub(crate) mod reader;
98mod writer;
99
100use crate::basic::{ColumnOrder, Compression, Encoding, Type};
101#[cfg(feature = "encryption")]
102use crate::encryption::{
103    decrypt::FileDecryptor,
104    modules::{create_module_aad, ModuleType},
105};
106use crate::errors::{ParquetError, Result};
107#[cfg(feature = "encryption")]
108use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData};
109pub(crate) use crate::file::metadata::memory::HeapSize;
110use crate::file::page_encoding_stats::{self, PageEncodingStats};
111use crate::file::page_index::index::Index;
112use crate::file::page_index::offset_index::OffsetIndexMetaData;
113use crate::file::statistics::{self, Statistics};
114use crate::format::ColumnCryptoMetaData as TColumnCryptoMetaData;
115use crate::format::{
116    BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup,
117    SizeStatistics, SortingColumn,
118};
119use crate::geospatial::statistics as geo_statistics;
120use crate::schema::types::{
121    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
122    Type as SchemaType,
123};
124#[cfg(feature = "encryption")]
125use crate::thrift::{TCompactSliceInputProtocol, TSerializable};
126pub use footer_tail::FooterTail;
127pub use push_decoder::ParquetMetaDataPushDecoder;
128pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
129use std::ops::Range;
130use std::sync::Arc;
131pub use writer::ParquetMetaDataWriter;
132pub(crate) use writer::ThriftMetadataWriter;
133
/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`Index`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`Index`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example `column_index[2][3]` holds the [`Index`] for the fourth
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetColumnIndex = Vec<Vec<Index>>;
150
/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number`.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
162
/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata (schema, row count, created-by, etc.)
    file_metadata: FileMetaData,
    /// Metadata for each row group, in file order
    row_groups: Vec<RowGroupMetaData>,
    /// Page level column index for each column chunk, if loaded
    /// (`column_index[row_group][column]`)
    column_index: Option<ParquetColumnIndex>,
    /// Offset index for each column chunk, if loaded
    /// (`offset_index[row_group][column]`)
    offset_index: Option<ParquetOffsetIndex>,
    /// Optional file decryptor used to read encrypted files
    #[cfg(feature = "encryption")]
    file_decryptor: Option<FileDecryptor>,
}
194
195impl ParquetMetaData {
196    /// Creates Parquet metadata from file metadata and a list of row
197    /// group metadata
198    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
199        ParquetMetaData {
200            file_metadata,
201            row_groups,
202            column_index: None,
203            offset_index: None,
204            #[cfg(feature = "encryption")]
205            file_decryptor: None,
206        }
207    }
208
209    /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of
210    /// encrypted data.
211    #[cfg(feature = "encryption")]
212    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
213        self.file_decryptor = file_decryptor;
214    }
215
216    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
217    pub fn into_builder(self) -> ParquetMetaDataBuilder {
218        self.into()
219    }
220
221    /// Returns file metadata as reference.
222    pub fn file_metadata(&self) -> &FileMetaData {
223        &self.file_metadata
224    }
225
226    /// Returns file decryptor as reference.
227    #[cfg(feature = "encryption")]
228    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
229        self.file_decryptor.as_ref()
230    }
231
232    /// Returns number of row groups in this file.
233    pub fn num_row_groups(&self) -> usize {
234        self.row_groups.len()
235    }
236
237    /// Returns row group metadata for `i`th position.
238    /// Position should be less than number of row groups `num_row_groups`.
239    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
240        &self.row_groups[i]
241    }
242
243    /// Returns slice of row groups in this file.
244    pub fn row_groups(&self) -> &[RowGroupMetaData] {
245        &self.row_groups
246    }
247
248    /// Returns the column index for this file if loaded
249    ///
250    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
251    /// [ArrowReaderOptions::with_page_index] was set to false.
252    ///
253    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
254    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
255        self.column_index.as_ref()
256    }
257
258    /// Returns offset indexes in this file, if loaded
259    ///
260    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
261    /// [ArrowReaderOptions::with_page_index] was set to false.
262    ///
263    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
264    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
265        self.offset_index.as_ref()
266    }
267
268    /// Estimate of the bytes allocated to store `ParquetMetadata`
269    ///
270    /// # Notes:
271    ///
272    /// 1. Includes size of self
273    ///
274    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
275    ///    [`RowGroupMetaData`].
276    ///
277    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
278    ///    means `memory_size` will over estimate the memory size if such pointers
279    ///    are shared.
280    ///
281    /// 4. Does not include any allocator overheads
282    pub fn memory_size(&self) -> usize {
283        std::mem::size_of::<Self>()
284            + self.file_metadata.heap_size()
285            + self.row_groups.heap_size()
286            + self.column_index.heap_size()
287            + self.offset_index.heap_size()
288    }
289
290    /// Override the column index
291    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
292        self.column_index = index;
293    }
294
295    /// Override the offset index
296    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
297        self.offset_index = index;
298    }
299}
300
/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// # Example creating a new [`ParquetMetaData`]
///
/// ```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
pub struct ParquetMetaDataBuilder(ParquetMetaData);
339
340impl ParquetMetaDataBuilder {
341    /// Create a new builder from a file metadata, with no row groups
342    pub fn new(file_meta_data: FileMetaData) -> Self {
343        Self(ParquetMetaData::new(file_meta_data, vec![]))
344    }
345
346    /// Create a new builder from an existing ParquetMetaData
347    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
348        Self(metadata)
349    }
350
351    /// Adds a row group to the metadata
352    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
353        self.0.row_groups.push(row_group);
354        self
355    }
356
357    /// Sets all the row groups to the specified list
358    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
359        self.0.row_groups = row_groups;
360        self
361    }
362
363    /// Takes ownership of the row groups in this builder, and clears the list
364    /// of row groups.
365    ///
366    /// This can be used for more efficient creation of a new ParquetMetaData
367    /// from an existing one.
368    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
369        std::mem::take(&mut self.0.row_groups)
370    }
371
372    /// Return a reference to the current row groups
373    pub fn row_groups(&self) -> &[RowGroupMetaData] {
374        &self.0.row_groups
375    }
376
377    /// Sets the column index
378    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
379        self.0.column_index = column_index;
380        self
381    }
382
383    /// Returns the current column index from the builder, replacing it with `None`
384    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
385        std::mem::take(&mut self.0.column_index)
386    }
387
388    /// Return a reference to the current column index, if any
389    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
390        self.0.column_index.as_ref()
391    }
392
393    /// Sets the offset index
394    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
395        self.0.offset_index = offset_index;
396        self
397    }
398
399    /// Returns the current offset index from the builder, replacing it with `None`
400    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
401        std::mem::take(&mut self.0.offset_index)
402    }
403
404    /// Return a reference to the current offset index, if any
405    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
406        self.0.offset_index.as_ref()
407    }
408
409    /// Creates a new ParquetMetaData from the builder
410    pub fn build(self) -> ParquetMetaData {
411        let Self(metadata) = self;
412        metadata
413    }
414}
415
416impl From<ParquetMetaData> for ParquetMetaDataBuilder {
417    fn from(meta_data: ParquetMetaData) -> Self {
418        Self(meta_data)
419    }
420}
421
/// A key-value pair for [`FileMetaData`].
///
/// Re-export of the Thrift-generated [`crate::format::KeyValue`].
pub type KeyValue = crate::format::KeyValue;

/// Reference counted pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
427
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Version of this file
    version: i32,
    /// Number of rows in the file
    num_rows: i64,
    /// Application that wrote this file, e.g.
    /// `parquet-mr version 1.8.0 (build 0fda28af...)`
    created_by: Option<String>,
    /// Optional application-defined key/value metadata pairs
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Shared descriptor of the file's schema
    schema_descr: SchemaDescPtr,
    /// Sort order used for `min`/`max` values of each column, by schema
    /// position; `None` means undefined (legacy) order for all columns
    column_orders: Option<Vec<ColumnOrder>>,
}
440
441impl FileMetaData {
442    /// Creates new file metadata.
443    pub fn new(
444        version: i32,
445        num_rows: i64,
446        created_by: Option<String>,
447        key_value_metadata: Option<Vec<KeyValue>>,
448        schema_descr: SchemaDescPtr,
449        column_orders: Option<Vec<ColumnOrder>>,
450    ) -> Self {
451        FileMetaData {
452            version,
453            num_rows,
454            created_by,
455            key_value_metadata,
456            schema_descr,
457            column_orders,
458        }
459    }
460
461    /// Returns version of this file.
462    pub fn version(&self) -> i32 {
463        self.version
464    }
465
466    /// Returns number of rows in the file.
467    pub fn num_rows(&self) -> i64 {
468        self.num_rows
469    }
470
471    /// String message for application that wrote this file.
472    ///
473    /// This should have the following format:
474    /// `<application> version <application version> (build <application build hash>)`.
475    ///
476    /// ```shell
477    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
478    /// ```
479    pub fn created_by(&self) -> Option<&str> {
480        self.created_by.as_deref()
481    }
482
483    /// Returns key_value_metadata of this file.
484    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
485        self.key_value_metadata.as_ref()
486    }
487
488    /// Returns Parquet [`Type`] that describes schema in this file.
489    ///
490    /// [`Type`]: crate::schema::types::Type
491    pub fn schema(&self) -> &SchemaType {
492        self.schema_descr.root_schema()
493    }
494
495    /// Returns a reference to schema descriptor.
496    pub fn schema_descr(&self) -> &SchemaDescriptor {
497        &self.schema_descr
498    }
499
500    /// Returns reference counted clone for schema descriptor.
501    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
502        self.schema_descr.clone()
503    }
504
505    /// Column (sort) order used for `min` and `max` values of each column in this file.
506    ///
507    /// Each column order corresponds to one column, determined by its position in the
508    /// list, matching the position of the column in the schema.
509    ///
510    /// When `None` is returned, there are no column orders available, and each column
511    /// should be assumed to have undefined (legacy) column order.
512    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
513        self.column_orders.as_ref()
514    }
515
516    /// Returns column order for `i`th column in this file.
517    /// If column orders are not available, returns undefined (legacy) column order.
518    pub fn column_order(&self, i: usize) -> ColumnOrder {
519        self.column_orders
520            .as_ref()
521            .map(|data| data[i])
522            .unwrap_or(ColumnOrder::UNDEFINED)
523    }
524}
525
/// Reference counted pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk, in schema order
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group
    num_rows: i64,
    /// Sort ordering of the rows, if any
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total uncompressed byte size of all column data
    total_byte_size: i64,
    /// Shared descriptor of the file's schema
    schema_descr: SchemaDescPtr,
    /// We can't infer this from the file offset of the first column since
    /// there may be empty columns in the row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
}
545
546impl RowGroupMetaData {
547    /// Returns builder for row group metadata.
548    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
549        RowGroupMetaDataBuilder::new(schema_descr)
550    }
551
552    /// Number of columns in this row group.
553    pub fn num_columns(&self) -> usize {
554        self.columns.len()
555    }
556
557    /// Returns column chunk metadata for `i`th column.
558    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
559        &self.columns[i]
560    }
561
562    /// Returns slice of column chunk metadata.
563    pub fn columns(&self) -> &[ColumnChunkMetaData] {
564        &self.columns
565    }
566
567    /// Returns mutable slice of column chunk metadata.
568    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
569        &mut self.columns
570    }
571
572    /// Number of rows in this row group.
573    pub fn num_rows(&self) -> i64 {
574        self.num_rows
575    }
576
577    /// Returns the sort ordering of the rows in this RowGroup if any
578    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
579        self.sorting_columns.as_ref()
580    }
581
582    /// Total byte size of all uncompressed column data in this row group.
583    pub fn total_byte_size(&self) -> i64 {
584        self.total_byte_size
585    }
586
587    /// Total size of all compressed column data in this row group.
588    pub fn compressed_size(&self) -> i64 {
589        self.columns.iter().map(|c| c.total_compressed_size).sum()
590    }
591
592    /// Returns reference to a schema descriptor.
593    pub fn schema_descr(&self) -> &SchemaDescriptor {
594        self.schema_descr.as_ref()
595    }
596
597    /// Returns reference counted clone of schema descriptor.
598    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
599        self.schema_descr.clone()
600    }
601
602    /// Returns ordinal position of this row group in file.
603    ///
604    /// For example if this is the first row group in the file, this will return 0.
605    /// If this is the second row group in the file, this will return 1.
606    #[inline(always)]
607    pub fn ordinal(&self) -> Option<i16> {
608        self.ordinal
609    }
610
611    /// Returns file offset of this row group in file.
612    #[inline(always)]
613    pub fn file_offset(&self) -> Option<i64> {
614        self.file_offset
615    }
616
617    /// Method to convert from encrypted Thrift.
618    #[cfg(feature = "encryption")]
619    fn from_encrypted_thrift(
620        schema_descr: SchemaDescPtr,
621        mut rg: RowGroup,
622        decryptor: Option<&FileDecryptor>,
623    ) -> Result<RowGroupMetaData> {
624        if schema_descr.num_columns() != rg.columns.len() {
625            return Err(general_err!(
626                "Column count mismatch. Schema has {} columns while Row Group has {}",
627                schema_descr.num_columns(),
628                rg.columns.len()
629            ));
630        }
631        let total_byte_size = rg.total_byte_size;
632        let num_rows = rg.num_rows;
633        let mut columns = vec![];
634
635        for (i, (mut c, d)) in rg
636            .columns
637            .drain(0..)
638            .zip(schema_descr.columns())
639            .enumerate()
640        {
641            // Read encrypted metadata if it's present and we have a decryptor.
642            if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) {
643                let column_decryptor = match c.crypto_metadata.as_ref() {
644                    None => {
645                        return Err(general_err!(
646                            "No crypto_metadata is set for column '{}', which has encrypted metadata",
647                            d.path().string()
648                        ));
649                    }
650                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => {
651                        let column_name = crypto_metadata.path_in_schema.join(".");
652                        decryptor.get_column_metadata_decryptor(
653                            column_name.as_str(),
654                            crypto_metadata.key_metadata.as_deref(),
655                        )?
656                    }
657                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => {
658                        decryptor.get_footer_decryptor()?
659                    }
660                };
661
662                let column_aad = create_module_aad(
663                    decryptor.file_aad(),
664                    ModuleType::ColumnMetaData,
665                    rg.ordinal.unwrap() as usize,
666                    i,
667                    None,
668                )?;
669
670                let buf = c.encrypted_column_metadata.clone().unwrap();
671                let decrypted_cc_buf = column_decryptor
672                    .decrypt(buf.as_slice(), column_aad.as_ref())
673                    .map_err(|_| {
674                        general_err!(
675                            "Unable to decrypt column '{}', perhaps the column key is wrong?",
676                            d.path().string()
677                        )
678                    })?;
679
680                let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice());
681                c.meta_data = Some(ColumnMetaData::read_from_in_protocol(&mut prot)?);
682            }
683            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
684        }
685
686        let sorting_columns = rg.sorting_columns;
687        Ok(RowGroupMetaData {
688            columns,
689            num_rows,
690            sorting_columns,
691            total_byte_size,
692            schema_descr,
693            file_offset: rg.file_offset,
694            ordinal: rg.ordinal,
695        })
696    }
697
698    /// Method to convert from Thrift.
699    pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
700        if schema_descr.num_columns() != rg.columns.len() {
701            return Err(general_err!(
702                "Column count mismatch. Schema has {} columns while Row Group has {}",
703                schema_descr.num_columns(),
704                rg.columns.len()
705            ));
706        }
707        let total_byte_size = rg.total_byte_size;
708        let num_rows = rg.num_rows;
709        let mut columns = vec![];
710
711        for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) {
712            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
713        }
714
715        let sorting_columns = rg.sorting_columns;
716        Ok(RowGroupMetaData {
717            columns,
718            num_rows,
719            sorting_columns,
720            total_byte_size,
721            schema_descr,
722            file_offset: rg.file_offset,
723            ordinal: rg.ordinal,
724        })
725    }
726
727    /// Method to convert to Thrift.
728    pub fn to_thrift(&self) -> RowGroup {
729        RowGroup {
730            columns: self.columns().iter().map(|v| v.to_thrift()).collect(),
731            total_byte_size: self.total_byte_size,
732            num_rows: self.num_rows,
733            sorting_columns: self.sorting_columns().cloned(),
734            file_offset: self.file_offset(),
735            total_compressed_size: Some(self.compressed_size()),
736            ordinal: self.ordinal,
737        }
738    }
739
740    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
741    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
742        RowGroupMetaDataBuilder(self)
743    }
744}
745
/// Builder for row group metadata.
///
/// Created via [`RowGroupMetaData::builder`]; call `build` to validate and
/// produce the final [`RowGroupMetaData`].
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
748
749impl RowGroupMetaDataBuilder {
750    /// Creates new builder from schema descriptor.
751    fn new(schema_descr: SchemaDescPtr) -> Self {
752        Self(RowGroupMetaData {
753            columns: Vec::with_capacity(schema_descr.num_columns()),
754            schema_descr,
755            file_offset: None,
756            num_rows: 0,
757            sorting_columns: None,
758            total_byte_size: 0,
759            ordinal: None,
760        })
761    }
762
763    /// Sets number of rows in this row group.
764    pub fn set_num_rows(mut self, value: i64) -> Self {
765        self.0.num_rows = value;
766        self
767    }
768
769    /// Sets the sorting order for columns
770    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
771        self.0.sorting_columns = value;
772        self
773    }
774
775    /// Sets total size in bytes for this row group.
776    pub fn set_total_byte_size(mut self, value: i64) -> Self {
777        self.0.total_byte_size = value;
778        self
779    }
780
781    /// Takes ownership of the the column metadata in this builder, and clears
782    /// the list of columns.
783    ///
784    /// This can be used for more efficient creation of a new RowGroupMetaData
785    /// from an existing one.
786    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
787        std::mem::take(&mut self.0.columns)
788    }
789
790    /// Sets column metadata for this row group.
791    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
792        self.0.columns = value;
793        self
794    }
795
796    /// Adds a column metadata to this row group
797    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
798        self.0.columns.push(value);
799        self
800    }
801
802    /// Sets ordinal for this row group.
803    pub fn set_ordinal(mut self, value: i16) -> Self {
804        self.0.ordinal = Some(value);
805        self
806    }
807
808    /// Sets file offset for this row group.
809    pub fn set_file_offset(mut self, value: i64) -> Self {
810        self.0.file_offset = Some(value);
811        self
812    }
813
814    /// Builds row group metadata.
815    pub fn build(self) -> Result<RowGroupMetaData> {
816        if self.0.schema_descr.num_columns() != self.0.columns.len() {
817            return Err(general_err!(
818                "Column length mismatch: {} != {}",
819                self.0.schema_descr.num_columns(),
820                self.0.columns.len()
821            ));
822        }
823
824        Ok(self.0)
825    }
826}
827
/// Metadata for a column chunk.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor (physical type, path, etc.) of this column.
    column_descr: ColumnDescPtr,
    /// All encodings used for this column chunk.
    encodings: Vec<Encoding>,
    /// File where the chunk is stored; `None` means the same file as the metadata.
    file_path: Option<String>,
    /// Byte offset of the `ColumnMetaData`; deprecated in the spec, modern writers use `0`.
    file_offset: i64,
    /// Total number of values in this column chunk.
    num_values: i64,
    /// Compression codec used for this chunk.
    compression: Compression,
    /// Total size of this chunk's data after compression, in bytes.
    total_compressed_size: i64,
    /// Total size of this chunk's data before compression, in bytes.
    total_uncompressed_size: i64,
    /// Byte offset of the first data page.
    data_page_offset: i64,
    /// Byte offset of the index page, if any.
    index_page_offset: Option<i64>,
    /// Byte offset of the dictionary page, if any.
    dictionary_page_offset: Option<i64>,
    /// Column chunk statistics, if written.
    statistics: Option<Statistics>,
    /// Geospatial statistics, if written (boxed, presumably to keep the common
    /// non-geospatial case small).
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    /// Per-page encoding statistics, if written.
    encoding_stats: Option<Vec<PageEncodingStats>>,
    /// Byte offset of the bloom filter, if any.
    bloom_filter_offset: Option<i64>,
    /// Length in bytes of the bloom filter, if known.
    bloom_filter_length: Option<i32>,
    /// Byte offset of the offset index, if any.
    offset_index_offset: Option<i64>,
    /// Length in bytes of the offset index, if any.
    offset_index_length: Option<i32>,
    /// Byte offset of the column index, if any.
    column_index_offset: Option<i64>,
    /// Length in bytes of the column index, if any.
    column_index_length: Option<i32>,
    /// Unencoded size of variable-length data (BYTE_ARRAY columns only); may be absent.
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Count of values at each repetition level; may be absent (older writers).
    repetition_level_histogram: Option<LevelHistogram>,
    /// Count of values at each definition level; may be absent (older writers).
    definition_level_histogram: Option<LevelHistogram>,
    /// Encryption metadata for this column, if encrypted.
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<ColumnCryptoMetaData>,
}
857
/// Histograms for repetition and definition levels.
///
/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of
/// values at level `i`.
///
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the
/// number of rows with level 1, and so on.
///
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    // Invariant: when built via `try_new(max_level)`, length is `max_level + 1`.
    inner: Vec<i64>,
}
870
871impl LevelHistogram {
872    /// Creates a new level histogram data.
873    ///
874    /// Length will be `max_level + 1`.
875    ///
876    /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case)
877    pub fn try_new(max_level: i16) -> Option<Self> {
878        if max_level > 0 {
879            Some(Self {
880                inner: vec![0; max_level as usize + 1],
881            })
882        } else {
883            None
884        }
885    }
886    /// Returns a reference to the the histogram's values.
887    pub fn values(&self) -> &[i64] {
888        &self.inner
889    }
890
891    /// Return the inner vector, consuming self
892    pub fn into_inner(self) -> Vec<i64> {
893        self.inner
894    }
895
896    /// Returns the histogram value at the given index.
897    ///
898    /// The value of `i` is the number of values with level `i`. For example,
899    /// `get(1)` returns the number of values with level 1.
900    ///
901    /// Returns `None` if the index is out of bounds.
902    pub fn get(&self, index: usize) -> Option<i64> {
903        self.inner.get(index).copied()
904    }
905
906    /// Adds the values from the other histogram to this histogram
907    ///
908    /// # Panics
909    /// If the histograms have different lengths
910    pub fn add(&mut self, other: &Self) {
911        assert_eq!(self.len(), other.len());
912        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
913            *dst += src;
914        }
915    }
916
917    /// return the length of the histogram
918    pub fn len(&self) -> usize {
919        self.inner.len()
920    }
921
922    /// returns if the histogram is empty
923    pub fn is_empty(&self) -> bool {
924        self.inner.is_empty()
925    }
926
927    /// Sets the values of all histogram levels to 0.
928    pub fn reset(&mut self) {
929        for value in self.inner.iter_mut() {
930            *value = 0;
931        }
932    }
933
934    /// Updates histogram values using provided repetition levels
935    ///
936    /// # Panics
937    /// if any of the levels is greater than the length of the histogram (
938    /// the argument supplied to [`Self::try_new`])
939    pub fn update_from_levels(&mut self, levels: &[i16]) {
940        for &level in levels {
941            self.inner[level as usize] += 1;
942        }
943    }
944}
945
/// Wraps a raw vector of per-level counts in a [`LevelHistogram`].
impl From<Vec<i64>> for LevelHistogram {
    fn from(inner: Vec<i64>) -> Self {
        Self { inner }
    }
}
951
/// Unwraps a [`LevelHistogram`] back into its raw vector of per-level counts.
impl From<LevelHistogram> for Vec<i64> {
    fn from(value: LevelHistogram) -> Self {
        value.into_inner()
    }
}
957
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        // Only the backing Vec allocates; the struct itself holds no other heap data.
        self.inner.heap_size()
    }
}
963
/// Represents common operations for a column chunk.
impl ColumnChunkMetaData {
    /// Returns builder for column chunk metadata.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// File where the column chunk is stored.
    ///
    /// If not set, assumed to belong to the same file as the metadata.
    /// This path is relative to the current file.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Byte offset of `ColumnMetaData` in `file_path()`.
    ///
    /// Note that the meaning of this field has been inconsistent between implementations
    /// so its use has since been deprecated in the Parquet specification. Modern implementations
    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
    /// `ColumnChunk` struct.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Type of this column. Must be primitive.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Path (or identifier) of this column.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Descriptor for this column.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Reference counted clone of descriptor for this column.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// All encodings used for this column.
    pub fn encodings(&self) -> &Vec<Encoding> {
        &self.encodings
    }

    /// Total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Compression for this column.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed data size of this column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed data size of this column chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset for the column data.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset for the index page.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset for the dictionary page, if any.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the offset and length in bytes of the column chunk within the file
    ///
    /// The chunk starts at the dictionary page when one is present, otherwise
    /// at the first data page.
    ///
    /// # Panics
    /// If the start offset or the compressed length is negative.
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns statistics that are set for this column chunk,
    /// or `None` if no statistics are available.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns geospatial statistics that are set for this column chunk,
    /// or `None` if no geospatial statistics are available.
    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
        self.geo_statistics.as_deref()
    }

    /// Returns the page encoding stats for this column chunk,
    /// or `None` if no page encoding stats are available.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        self.encoding_stats.as_ref()
    }

    /// Returns the offset for the bloom filter.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length in bytes of the bloom filter.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset for the column index.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length in bytes of the column index.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range for the column index if any
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset for the offset index.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length in bytes of the offset index.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range for the offset index if any
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of bytes of variable length data after decoding.
    ///
    /// Only set for BYTE_ARRAY columns. This field may not be set by older
    /// writers.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
    /// `vec[0]` indicates how many rows the page contains.
    /// This field may not be set by older writers.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
    /// This field may not be set by older writers.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the encryption metadata for this column chunk.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_ref()
    }

    /// Method to convert from Thrift.
    ///
    /// # Errors
    /// Returns an error if `cc.meta_data` is absent, or if any embedded
    /// enum, statistics, or encoding value fails to convert.
    pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
        if cc.meta_data.is_none() {
            return Err(general_err!("Expected to have column metadata"));
        }
        let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap();
        let column_type = Type::try_from(col_metadata.type_)?;
        let encodings = col_metadata
            .encodings
            .drain(0..)
            .map(Encoding::try_from)
            .collect::<Result<_>>()?;
        let compression = Compression::try_from(col_metadata.codec)?;
        let file_path = cc.file_path;
        let file_offset = cc.file_offset;
        let num_values = col_metadata.num_values;
        let total_compressed_size = col_metadata.total_compressed_size;
        let total_uncompressed_size = col_metadata.total_uncompressed_size;
        let data_page_offset = col_metadata.data_page_offset;
        let index_page_offset = col_metadata.index_page_offset;
        let dictionary_page_offset = col_metadata.dictionary_page_offset;
        let statistics = statistics::from_thrift(column_type, col_metadata.statistics)?;
        let geo_statistics =
            geo_statistics::from_thrift(col_metadata.geospatial_statistics).map(Box::new);
        let encoding_stats = col_metadata
            .encoding_stats
            .as_ref()
            .map(|vec| {
                vec.iter()
                    .map(page_encoding_stats::try_from_thrift)
                    .collect::<Result<_>>()
            })
            .transpose()?;
        let bloom_filter_offset = col_metadata.bloom_filter_offset;
        let bloom_filter_length = col_metadata.bloom_filter_length;
        let offset_index_offset = cc.offset_index_offset;
        let offset_index_length = cc.offset_index_length;
        let column_index_offset = cc.column_index_offset;
        let column_index_length = cc.column_index_length;
        // Size statistics are optional; files from older writers omit them,
        // in which case all three fields stay `None`.
        let (
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
        ) = if let Some(size_stats) = col_metadata.size_statistics {
            (
                size_stats.unencoded_byte_array_data_bytes,
                size_stats.repetition_level_histogram,
                size_stats.definition_level_histogram,
            )
        } else {
            (None, None, None)
        };

        let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from);
        let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);

        #[cfg(feature = "encryption")]
        let column_crypto_metadata = if let Some(crypto_metadata) = cc.crypto_metadata {
            Some(column_crypto_metadata::try_from_thrift(&crypto_metadata)?)
        } else {
            None
        };

        let result = ColumnChunkMetaData {
            column_descr,
            encodings,
            file_path,
            file_offset,
            num_values,
            compression,
            total_compressed_size,
            total_uncompressed_size,
            data_page_offset,
            index_page_offset,
            dictionary_page_offset,
            statistics,
            encoding_stats,
            bloom_filter_offset,
            bloom_filter_length,
            offset_index_offset,
            offset_index_length,
            column_index_offset,
            column_index_length,
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
            geo_statistics,
            #[cfg(feature = "encryption")]
            column_crypto_metadata,
        };
        Ok(result)
    }

    /// Method to convert to Thrift.
    pub fn to_thrift(&self) -> ColumnChunk {
        let column_metadata = self.to_column_metadata_thrift();

        ColumnChunk {
            file_path: self.file_path().map(|s| s.to_owned()),
            file_offset: self.file_offset,
            meta_data: Some(column_metadata),
            offset_index_offset: self.offset_index_offset,
            offset_index_length: self.offset_index_length,
            column_index_offset: self.column_index_offset,
            column_index_length: self.column_index_length,
            crypto_metadata: self.column_crypto_metadata_thrift(),
            encrypted_column_metadata: None,
        }
    }

    /// Method to convert to Thrift `ColumnMetaData`
    pub fn to_column_metadata_thrift(&self) -> ColumnMetaData {
        // Only emit a SizeStatistics struct when at least one of its fields is
        // present, mirroring `from_thrift`.
        let size_statistics = if self.unencoded_byte_array_data_bytes.is_some()
            || self.repetition_level_histogram.is_some()
            || self.definition_level_histogram.is_some()
        {
            let repetition_level_histogram = self
                .repetition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            let definition_level_histogram = self
                .definition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            Some(SizeStatistics {
                unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes,
                repetition_level_histogram,
                definition_level_histogram,
            })
        } else {
            None
        };

        ColumnMetaData {
            type_: self.column_type().into(),
            encodings: self.encodings().iter().map(|&v| v.into()).collect(),
            path_in_schema: self.column_path().as_ref().to_vec(),
            codec: self.compression.into(),
            num_values: self.num_values,
            total_uncompressed_size: self.total_uncompressed_size,
            total_compressed_size: self.total_compressed_size,
            key_value_metadata: None,
            data_page_offset: self.data_page_offset,
            index_page_offset: self.index_page_offset,
            dictionary_page_offset: self.dictionary_page_offset,
            statistics: statistics::to_thrift(self.statistics.as_ref()),
            encoding_stats: self
                .encoding_stats
                .as_ref()
                .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()),
            bloom_filter_offset: self.bloom_filter_offset,
            bloom_filter_length: self.bloom_filter_length,
            size_statistics,
            geospatial_statistics: geo_statistics::to_thrift(
                self.geo_statistics.as_ref().map(|boxed| boxed.as_ref()),
            ),
        }
    }

    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }

    #[cfg(feature = "encryption")]
    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
        self.column_crypto_metadata
            .as_ref()
            .map(column_crypto_metadata::to_thrift)
    }

    // Stub so `to_thrift` compiles identically with the feature disabled.
    #[cfg(not(feature = "encryption"))]
    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
        None
    }
}
1335
/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
// Newtype over the target struct: setters write fields in place and `build`
// simply unwraps.
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1355
impl ColumnChunkMetaDataBuilder {
    /// Creates new column chunk metadata builder.
    ///
    /// All fields start at zero/empty/`None` defaults, with compression set to
    /// `UNCOMPRESSED`.
    ///
    /// See also [`ColumnChunkMetaData::builder`]
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Vec::new(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            geo_statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
        })
    }

    /// Sets list of encodings for this column chunk.
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets optional file path for this column chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets number of values.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets compression.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets data page offset in bytes.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets optional dictionary page offset in bytes.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets optional index page offset in bytes.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets statistics for this column chunk.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Sets geospatial statistics for this column chunk.
    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
        self.0.geo_statistics = Some(value);
        self
    }

    /// Clears the statistics for this column chunk.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets page encoding stats for this column chunk.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(value);
        self
    }

    /// Clears the page encoding stats for this column chunk.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets optional bloom filter offset in bytes.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets optional bloom filter length in bytes.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets optional offset index offset in bytes.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets optional offset index length in bytes.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets optional column index offset in bytes.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets optional column index length in bytes.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets optional length of variable length data in bytes.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets optional repetition level histogram
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets optional definition level histogram
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the encryption metadata for an encrypted column
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value;
        self
    }

    /// Builds column chunk metadata.
    ///
    /// Infallible today; returns `Result` for API stability.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}
1540
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
    /// Per-page flag: `true` when the page contains only nulls.
    null_pages: Vec<bool>,
    /// Encoded minimum value of each page.
    min_values: Vec<Vec<u8>>,
    /// Encoded maximum value of each page.
    max_values: Vec<Vec<u8>>,
    /// Null count of each page.
    null_counts: Vec<i64>,
    /// Ordering of min/max values across the pages.
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any entry in the page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-nan values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}
1563
/// `Default` delegates to [`ColumnIndexBuilder::new`].
impl Default for ColumnIndexBuilder {
    fn default() -> Self {
        Self::new()
    }
}
1569
1570impl ColumnIndexBuilder {
1571    /// Creates a new column index builder.
1572    pub fn new() -> Self {
1573        ColumnIndexBuilder {
1574            null_pages: Vec::new(),
1575            min_values: Vec::new(),
1576            max_values: Vec::new(),
1577            null_counts: Vec::new(),
1578            boundary_order: BoundaryOrder::UNORDERED,
1579            repetition_level_histograms: None,
1580            definition_level_histograms: None,
1581            valid: true,
1582        }
1583    }
1584
1585    /// Append statistics for the next page
1586    pub fn append(
1587        &mut self,
1588        null_page: bool,
1589        min_value: Vec<u8>,
1590        max_value: Vec<u8>,
1591        null_count: i64,
1592    ) {
1593        self.null_pages.push(null_page);
1594        self.min_values.push(min_value);
1595        self.max_values.push(max_value);
1596        self.null_counts.push(null_count);
1597    }
1598
1599    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
1600    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
1601    pub fn append_histograms(
1602        &mut self,
1603        repetition_level_histogram: &Option<LevelHistogram>,
1604        definition_level_histogram: &Option<LevelHistogram>,
1605    ) {
1606        if !self.valid {
1607            return;
1608        }
1609        if let Some(ref rep_lvl_hist) = repetition_level_histogram {
1610            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1611            hist.reserve(rep_lvl_hist.len());
1612            hist.extend(rep_lvl_hist.values());
1613        }
1614        if let Some(ref def_lvl_hist) = definition_level_histogram {
1615            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1616            hist.reserve(def_lvl_hist.len());
1617            hist.extend(def_lvl_hist.values());
1618        }
1619    }
1620
1621    /// Set the boundary order of the column index
1622    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1623        self.boundary_order = boundary_order;
1624    }
1625
1626    /// Mark this column index as invalid
1627    pub fn to_invalid(&mut self) {
1628        self.valid = false;
1629    }
1630
1631    /// Is the information in the builder valid?
1632    pub fn valid(&self) -> bool {
1633        self.valid
1634    }
1635
1636    /// Build and get the thrift metadata of column index
1637    ///
1638    /// Note: callers should check [`Self::valid`] before calling this method
1639    pub fn build_to_thrift(self) -> ColumnIndex {
1640        ColumnIndex::new(
1641            self.null_pages,
1642            self.min_values,
1643            self.max_values,
1644            self.boundary_order,
1645            self.null_counts,
1646            self.repetition_level_histograms,
1647            self.definition_level_histograms,
1648        )
1649    }
1650}
1651
1652impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1653    fn from(value: ColumnChunkMetaData) -> Self {
1654        ColumnChunkMetaDataBuilder(value)
1655    }
1656}
1657
/// Builder for offset index, part of the Parquet [PageIndex].
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
    /// Byte offset of each page.
    offset_array: Vec<i64>,
    /// Compressed size in bytes of each page.
    compressed_page_size_array: Vec<i32>,
    /// Index of the first row of each page.
    first_row_index_array: Vec<i64>,
    /// Per-page unencoded byte-array data sizes; `None` until a value is appended.
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running row count; the first row index of the next page to be appended.
    current_first_row_index: i64,
}
1668
impl Default for OffsetIndexBuilder {
    /// Equivalent to [`OffsetIndexBuilder::new`].
    fn default() -> Self {
        Self::new()
    }
}
1674
1675impl OffsetIndexBuilder {
1676    /// Creates a new offset index builder.
1677    pub fn new() -> Self {
1678        OffsetIndexBuilder {
1679            offset_array: Vec::new(),
1680            compressed_page_size_array: Vec::new(),
1681            first_row_index_array: Vec::new(),
1682            unencoded_byte_array_data_bytes_array: None,
1683            current_first_row_index: 0,
1684        }
1685    }
1686
1687    /// Append the row count of the next page.
1688    pub fn append_row_count(&mut self, row_count: i64) {
1689        let current_page_row_index = self.current_first_row_index;
1690        self.first_row_index_array.push(current_page_row_index);
1691        self.current_first_row_index += row_count;
1692    }
1693
1694    /// Append the offset and size of the next page.
1695    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1696        self.offset_array.push(offset);
1697        self.compressed_page_size_array.push(compressed_page_size);
1698    }
1699
1700    /// Append the unencoded byte array data bytes of the next page.
1701    pub fn append_unencoded_byte_array_data_bytes(
1702        &mut self,
1703        unencoded_byte_array_data_bytes: Option<i64>,
1704    ) {
1705        if let Some(val) = unencoded_byte_array_data_bytes {
1706            self.unencoded_byte_array_data_bytes_array
1707                .get_or_insert(Vec::new())
1708                .push(val);
1709        }
1710    }
1711
1712    /// Build and get the thrift metadata of offset index
1713    pub fn build_to_thrift(self) -> OffsetIndex {
1714        let locations = self
1715            .offset_array
1716            .iter()
1717            .zip(self.compressed_page_size_array.iter())
1718            .zip(self.first_row_index_array.iter())
1719            .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index))
1720            .collect::<Vec<_>>();
1721        OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array)
1722    }
1723}
1724
#[cfg(test)]
mod tests {
    use super::*;
    use crate::basic::{PageType, SortOrder};
    use crate::file::page_index::index::NativeIndex;

    /// Round-trips a fully-populated `RowGroupMetaData` through its thrift
    /// representation and verifies nothing is lost or altered.
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        let row_group_exp = row_group_meta.to_thrift();
        let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(row_group_res, row_group_exp);
    }

    /// Building row group metadata with no column metadata must fail,
    /// since the test schema declares two columns.
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }

    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // Serialize a 2-column row group, then decode it against the
        // 3-column schema to simulate the corruption.
        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();

        let err =
            RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift())
                .unwrap_err()
                .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }

    /// Round-trips `ColumnChunkMetaData` with every optional field set
    /// through its thrift representation.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        let col_chunk_res =
            ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    /// Round-trips `ColumnChunkMetaData` with only required fields set.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_empty() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .build()
            .unwrap();

        let col_chunk_exp = col_metadata.to_thrift();
        let col_chunk_res = ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(col_chunk_res, col_chunk_exp);
    }

    /// Row group compressed size is the sum of its column chunks'
    /// compressed sizes (2 columns x 500 bytes each).
    #[test]
    fn test_compressed_size() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for column_descr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(column_descr.clone())
                .set_total_compressed_size(500)
                .set_total_uncompressed_size(700)
                .build()
                .unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();

        let compressed_size_res: i64 = row_group_meta.compressed_size();
        let compressed_size_exp: i64 = 1000;

        assert_eq!(compressed_size_res, compressed_size_exp);
    }

    /// Checks `ParquetMetaData::memory_size` against pinned expected values.
    ///
    /// NOTE(review): the expected byte counts below are tied to the exact
    /// in-memory layout of the metadata structs (and differ under the
    /// `encryption` feature); they must be updated whenever those structs
    /// change.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Now, add in Exact Statistics
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2344;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2680;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        let mut column_index = ColumnIndexBuilder::new();
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build_to_thrift();
        let native_index = NativeIndex::<bool>::try_new(column_index).unwrap();

        // Now, add in OffsetIndex
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build_to_thrift();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![
                OffsetIndexMetaData::try_new(offset_index).unwrap()
            ]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 2848;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3184;

        // more set fields means more memory usage
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }

    /// Returns sample schema descriptor so we can create column metadata.
    fn get_test_schema_descr() -> SchemaDescPtr {
        // Two INT32 leaf columns, "a" and "b", under a group root.
        let schema = SchemaType::group_type_builder("schema")
            .with_fields(vec![
                Arc::new(
                    SchemaType::primitive_type_builder("a", Type::INT32)
                        .build()
                        .unwrap(),
                ),
                Arc::new(
                    SchemaType::primitive_type_builder("b", Type::INT32)
                        .build()
                        .unwrap(),
                ),
            ])
            .build()
            .unwrap();

        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
    }
}