parquet/file/metadata/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet metadata API
19//!
20//! Most users should use these structures to interact with Parquet metadata.
21//! The [crate::format] module contains lower level structures generated from the
22//! Parquet thrift definition.
23//!
24//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
25//!   file footer.
26//!
27//! * [`FileMetaData`]: File level metadata such as schema, row counts and
28//!   version.
29//!
//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
31//!   location and number of rows, and column chunks.
32//!
33//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
34//!   within a Row Group including encoding and compression information,
35//!   number of values, statistics, etc.
36//!
37//! # APIs for working with Parquet Metadata
38//!
39//! The Parquet readers and writers in this crate handle reading and writing
40//! metadata into parquet files. To work with metadata directly,
41//! the following APIs are available:
42//!
43//! * [`ParquetMetaDataReader`] for reading
44//! * [`ParquetMetaDataWriter`] for writing.
45//!
46//! [`ParquetMetaDataReader`]: https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html
47//! [`ParquetMetaDataWriter`]: https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataWriter.html
48//!
49//! # Examples
50//!
51//! Please see [`external_metadata.rs`]
52//!
53//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
54//!
55//! # Metadata Encodings and Structures
56//!
57//! There are three different encodings of Parquet Metadata in this crate:
58//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
60//!    [parquet.thrift]
61//!
62//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
63//!    from [parquet.thrift]. These structures are low level and mirror
64//!    the thrift definitions.
65//!
66//! 3. [`file::metadata`] (this module): Easier to use Rust structures
67//!    with a more idiomatic API. Note that, confusingly, some but not all
68//!    of these structures have the same name as the [`format`] structures.
69//!
70//! [`format`]: crate::format
71//! [`file::metadata`]: crate::file::metadata
72//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
73//!
74//! Graphically, this is how the different structures relate to each other:
75//!
76//! ```text
77//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
78//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
79//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
80//!                            └──────────────┘     │         └───────────────────────┘ │
81//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
82//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
83//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
84//!                                     ...         │                   ...             │
85//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
86//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
87//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
88//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
89//!
90//!                          format::meta structures          file::metadata structures
91//!
92//!                         * Same name, different struct
93//! ```
94mod memory;
95pub(crate) mod reader;
96mod writer;
97
98use crate::basic::{ColumnOrder, Compression, Encoding, Type};
99#[cfg(feature = "encryption")]
100use crate::encryption::{
101    decrypt::FileDecryptor,
102    modules::{create_module_aad, ModuleType},
103};
104use crate::errors::{ParquetError, Result};
105#[cfg(feature = "encryption")]
106use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData};
107pub(crate) use crate::file::metadata::memory::HeapSize;
108use crate::file::page_encoding_stats::{self, PageEncodingStats};
109use crate::file::page_index::index::Index;
110use crate::file::page_index::offset_index::OffsetIndexMetaData;
111use crate::file::statistics::{self, Statistics};
112use crate::format::ColumnCryptoMetaData as TColumnCryptoMetaData;
113use crate::format::{
114    BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup,
115    SizeStatistics, SortingColumn,
116};
117use crate::schema::types::{
118    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
119    Type as SchemaType,
120};
121#[cfg(feature = "encryption")]
122use crate::thrift::{TCompactSliceInputProtocol, TSerializable};
123pub use reader::ParquetMetaDataReader;
124use std::ops::Range;
125use std::sync::Arc;
126pub use writer::ParquetMetaDataWriter;
127pub(crate) use writer::ThriftMetadataWriter;
128
/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`Index`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`Index`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example `column_index[2][3]` holds the [`Index`] for the fourth
/// column in the third row group of the parquet file (both indexes are
/// zero based).
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetColumnIndex = Vec<Vec<Index>>;
145
/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number`.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
157
/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata
    file_metadata: FileMetaData,
    /// Row group metadata, one entry per row group in the file
    row_groups: Vec<RowGroupMetaData>,
    /// Page level (statistics) index for each page in each column chunk;
    /// `None` if the index was not loaded
    column_index: Option<ParquetColumnIndex>,
    /// Offset index for each page in each column chunk; `None` if the index
    /// was not loaded
    offset_index: Option<ParquetOffsetIndex>,
    /// Optional file decryptor used to read encrypted files
    #[cfg(feature = "encryption")]
    file_decryptor: Option<FileDecryptor>,
}
189
impl ParquetMetaData {
    /// Creates Parquet metadata from file metadata and a list of row
    /// group metadata.
    ///
    /// The page indexes (and, with the `encryption` feature, the file
    /// decryptor) are initialized to `None`.
    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
        ParquetMetaData {
            file_metadata,
            row_groups,
            #[cfg(feature = "encryption")]
            file_decryptor: None,
            column_index: None,
            offset_index: None,
        }
    }

    /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of
    /// encrypted data.
    #[cfg(feature = "encryption")]
    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
        self.file_decryptor = file_decryptor;
    }

    /// Creates Parquet metadata from file metadata, a list of row
    /// group metadata, and the column index structures.
    #[deprecated(since = "53.1.0", note = "Use ParquetMetaDataBuilder")]
    pub fn new_with_page_index(
        file_metadata: FileMetaData,
        row_groups: Vec<RowGroupMetaData>,
        column_index: Option<ParquetColumnIndex>,
        offset_index: Option<ParquetOffsetIndex>,
    ) -> Self {
        ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_groups)
            .set_column_index(column_index)
            .set_offset_index(offset_index)
            .build()
    }

    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
    pub fn into_builder(self) -> ParquetMetaDataBuilder {
        self.into()
    }

    /// Returns file metadata as reference.
    pub fn file_metadata(&self) -> &FileMetaData {
        &self.file_metadata
    }

    /// Returns file decryptor as reference.
    #[cfg(feature = "encryption")]
    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
        self.file_decryptor.as_ref()
    }

    /// Returns number of row groups in this file.
    pub fn num_row_groups(&self) -> usize {
        self.row_groups.len()
    }

    /// Returns row group metadata for `i`th position.
    /// Position should be less than number of row groups `num_row_groups`.
    ///
    /// # Panics
    ///
    /// Panics if `i` is greater than or equal to [`Self::num_row_groups`].
    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
        &self.row_groups[i]
    }

    /// Returns slice of row groups in this file.
    pub fn row_groups(&self) -> &[RowGroupMetaData] {
        &self.row_groups
    }

    /// Returns the column index for this file if loaded
    ///
    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
    /// [ArrowReaderOptions::with_page_index] was set to false.
    ///
    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
        self.column_index.as_ref()
    }

    /// Returns offset indexes in this file, if loaded
    ///
    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
    /// [ArrowReaderOptions::with_page_index] was set to false.
    ///
    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
        self.offset_index.as_ref()
    }

    /// Estimate of the bytes allocated to store `ParquetMetadata`
    ///
    /// # Notes:
    ///
    /// 1. Includes size of self
    ///
    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
    ///    [`RowGroupMetaData`].
    ///
    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
    ///    means `memory_size` will over estimate the memory size if such pointers
    ///    are shared.
    ///
    /// 4. Does not include any allocator overheads
    ///
    /// 5. With the `encryption` feature, does not include heap memory owned by
    ///    the optional file decryptor (only its inline size via `size_of::<Self>()`)
    pub fn memory_size(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.file_metadata.heap_size()
            + self.row_groups.heap_size()
            + self.column_index.heap_size()
            + self.offset_index.heap_size()
    }

    /// Override the column index
    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
        self.column_index = index;
    }

    /// Override the offset index
    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
        self.offset_index = index;
    }
}
311
/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// # Example creating a new [`ParquetMetaData`]
///
///```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
///
/// Call [`Self::build`] to produce the final [`ParquetMetaData`].
pub struct ParquetMetaDataBuilder(ParquetMetaData);
350
351impl ParquetMetaDataBuilder {
352    /// Create a new builder from a file metadata, with no row groups
353    pub fn new(file_meta_data: FileMetaData) -> Self {
354        Self(ParquetMetaData::new(file_meta_data, vec![]))
355    }
356
357    /// Create a new builder from an existing ParquetMetaData
358    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
359        Self(metadata)
360    }
361
362    /// Adds a row group to the metadata
363    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
364        self.0.row_groups.push(row_group);
365        self
366    }
367
368    /// Sets all the row groups to the specified list
369    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
370        self.0.row_groups = row_groups;
371        self
372    }
373
374    /// Takes ownership of the row groups in this builder, and clears the list
375    /// of row groups.
376    ///
377    /// This can be used for more efficient creation of a new ParquetMetaData
378    /// from an existing one.
379    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
380        std::mem::take(&mut self.0.row_groups)
381    }
382
383    /// Return a reference to the current row groups
384    pub fn row_groups(&self) -> &[RowGroupMetaData] {
385        &self.0.row_groups
386    }
387
388    /// Sets the column index
389    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
390        self.0.column_index = column_index;
391        self
392    }
393
394    /// Returns the current column index from the builder, replacing it with `None`
395    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
396        std::mem::take(&mut self.0.column_index)
397    }
398
399    /// Return a reference to the current column index, if any
400    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
401        self.0.column_index.as_ref()
402    }
403
404    /// Sets the offset index
405    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
406        self.0.offset_index = offset_index;
407        self
408    }
409
410    /// Returns the current offset index from the builder, replacing it with `None`
411    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
412        std::mem::take(&mut self.0.offset_index)
413    }
414
415    /// Return a reference to the current offset index, if any
416    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
417        self.0.offset_index.as_ref()
418    }
419
420    /// Creates a new ParquetMetaData from the builder
421    pub fn build(self) -> ParquetMetaData {
422        let Self(metadata) = self;
423        metadata
424    }
425}
426
/// Converts a [`ParquetMetaData`] into a builder, preserving all of its
/// contents (this is what backs [`ParquetMetaData::into_builder`]).
impl From<ParquetMetaData> for ParquetMetaDataBuilder {
    fn from(meta_data: ParquetMetaData) -> Self {
        Self(meta_data)
    }
}
432
/// A key-value pair for [`FileMetaData`].
///
/// Alias for the thrift-generated [`crate::format::KeyValue`].
pub type KeyValue = crate::format::KeyValue;

/// Reference counted pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
438
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Format version of this file (see [`Self::version`])
    version: i32,
    /// Number of rows in this file (see [`Self::num_rows`])
    num_rows: i64,
    /// Message describing the application that wrote the file (see [`Self::created_by`])
    created_by: Option<String>,
    /// Optional application specific key/value metadata (see [`Self::key_value_metadata`])
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Shared descriptor of this file's schema (see [`Self::schema_descr`])
    schema_descr: SchemaDescPtr,
    /// Sort order used for `min`/`max` values of each column (see [`Self::column_orders`])
    column_orders: Option<Vec<ColumnOrder>>,
}
451
452impl FileMetaData {
453    /// Creates new file metadata.
454    pub fn new(
455        version: i32,
456        num_rows: i64,
457        created_by: Option<String>,
458        key_value_metadata: Option<Vec<KeyValue>>,
459        schema_descr: SchemaDescPtr,
460        column_orders: Option<Vec<ColumnOrder>>,
461    ) -> Self {
462        FileMetaData {
463            version,
464            num_rows,
465            created_by,
466            key_value_metadata,
467            schema_descr,
468            column_orders,
469        }
470    }
471
472    /// Returns version of this file.
473    pub fn version(&self) -> i32 {
474        self.version
475    }
476
477    /// Returns number of rows in the file.
478    pub fn num_rows(&self) -> i64 {
479        self.num_rows
480    }
481
482    /// String message for application that wrote this file.
483    ///
484    /// This should have the following format:
485    /// `<application> version <application version> (build <application build hash>)`.
486    ///
487    /// ```shell
488    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
489    /// ```
490    pub fn created_by(&self) -> Option<&str> {
491        self.created_by.as_deref()
492    }
493
494    /// Returns key_value_metadata of this file.
495    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
496        self.key_value_metadata.as_ref()
497    }
498
499    /// Returns Parquet [`Type`] that describes schema in this file.
500    ///
501    /// [`Type`]: crate::schema::types::Type
502    pub fn schema(&self) -> &SchemaType {
503        self.schema_descr.root_schema()
504    }
505
506    /// Returns a reference to schema descriptor.
507    pub fn schema_descr(&self) -> &SchemaDescriptor {
508        &self.schema_descr
509    }
510
511    /// Returns reference counted clone for schema descriptor.
512    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
513        self.schema_descr.clone()
514    }
515
516    /// Column (sort) order used for `min` and `max` values of each column in this file.
517    ///
518    /// Each column order corresponds to one column, determined by its position in the
519    /// list, matching the position of the column in the schema.
520    ///
521    /// When `None` is returned, there are no column orders available, and each column
522    /// should be assumed to have undefined (legacy) column order.
523    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
524        self.column_orders.as_ref()
525    }
526
527    /// Returns column order for `i`th column in this file.
528    /// If column orders are not available, returns undefined (legacy) column order.
529    pub fn column_order(&self, i: usize) -> ColumnOrder {
530        self.column_orders
531            .as_ref()
532            .map(|data| data[i])
533            .unwrap_or(ColumnOrder::UNDEFINED)
534    }
535}
536
/// Reference counted pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk in this row group
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group
    num_rows: i64,
    /// Sort ordering of the rows, if any
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total byte size of all uncompressed column data in this row group
    total_byte_size: i64,
    /// Shared descriptor of this row group's schema
    schema_descr: SchemaDescPtr,
    /// We can't infer from file offset of first column since there may be empty columns in row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
}
556
557impl RowGroupMetaData {
558    /// Returns builder for row group metadata.
559    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
560        RowGroupMetaDataBuilder::new(schema_descr)
561    }
562
563    /// Number of columns in this row group.
564    pub fn num_columns(&self) -> usize {
565        self.columns.len()
566    }
567
568    /// Returns column chunk metadata for `i`th column.
569    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
570        &self.columns[i]
571    }
572
573    /// Returns slice of column chunk metadata.
574    pub fn columns(&self) -> &[ColumnChunkMetaData] {
575        &self.columns
576    }
577
578    /// Returns mutable slice of column chunk metadata.
579    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
580        &mut self.columns
581    }
582
583    /// Number of rows in this row group.
584    pub fn num_rows(&self) -> i64 {
585        self.num_rows
586    }
587
588    /// Returns the sort ordering of the rows in this RowGroup if any
589    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
590        self.sorting_columns.as_ref()
591    }
592
593    /// Total byte size of all uncompressed column data in this row group.
594    pub fn total_byte_size(&self) -> i64 {
595        self.total_byte_size
596    }
597
598    /// Total size of all compressed column data in this row group.
599    pub fn compressed_size(&self) -> i64 {
600        self.columns.iter().map(|c| c.total_compressed_size).sum()
601    }
602
603    /// Returns reference to a schema descriptor.
604    pub fn schema_descr(&self) -> &SchemaDescriptor {
605        self.schema_descr.as_ref()
606    }
607
608    /// Returns reference counted clone of schema descriptor.
609    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
610        self.schema_descr.clone()
611    }
612
613    /// Returns ordinal position of this row group in file.
614    ///
615    /// For example if this is the first row group in the file, this will return 0.
616    /// If this is the second row group in the file, this will return 1.
617    #[inline(always)]
618    pub fn ordinal(&self) -> Option<i16> {
619        self.ordinal
620    }
621
622    /// Returns file offset of this row group in file.
623    #[inline(always)]
624    pub fn file_offset(&self) -> Option<i64> {
625        self.file_offset
626    }
627
628    /// Method to convert from encrypted Thrift.
629    #[cfg(feature = "encryption")]
630    fn from_encrypted_thrift(
631        schema_descr: SchemaDescPtr,
632        mut rg: RowGroup,
633        decryptor: Option<&FileDecryptor>,
634    ) -> Result<RowGroupMetaData> {
635        if schema_descr.num_columns() != rg.columns.len() {
636            return Err(general_err!(
637                "Column count mismatch. Schema has {} columns while Row Group has {}",
638                schema_descr.num_columns(),
639                rg.columns.len()
640            ));
641        }
642        let total_byte_size = rg.total_byte_size;
643        let num_rows = rg.num_rows;
644        let mut columns = vec![];
645
646        for (i, (mut c, d)) in rg
647            .columns
648            .drain(0..)
649            .zip(schema_descr.columns())
650            .enumerate()
651        {
652            // Read encrypted metadata if it's present and we have a decryptor.
653            if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) {
654                let column_decryptor = match c.crypto_metadata.as_ref() {
655                    None => {
656                        return Err(general_err!(
657                            "No crypto_metadata is set for column '{}', which has encrypted metadata",
658                            d.path().string()
659                        ));
660                    }
661                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => {
662                        let column_name = crypto_metadata.path_in_schema.join(".");
663                        decryptor.get_column_metadata_decryptor(
664                            column_name.as_str(),
665                            crypto_metadata.key_metadata.as_deref(),
666                        )?
667                    }
668                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => {
669                        decryptor.get_footer_decryptor()?
670                    }
671                };
672
673                let column_aad = create_module_aad(
674                    decryptor.file_aad(),
675                    ModuleType::ColumnMetaData,
676                    rg.ordinal.unwrap() as usize,
677                    i,
678                    None,
679                )?;
680
681                let buf = c.encrypted_column_metadata.clone().unwrap();
682                let decrypted_cc_buf = column_decryptor
683                    .decrypt(buf.as_slice(), column_aad.as_ref())
684                    .map_err(|_| {
685                        general_err!(
686                            "Unable to decrypt column '{}', perhaps the column key is wrong?",
687                            d.path().string()
688                        )
689                    })?;
690
691                let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice());
692                c.meta_data = Some(ColumnMetaData::read_from_in_protocol(&mut prot)?);
693            }
694            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
695        }
696
697        let sorting_columns = rg.sorting_columns;
698        Ok(RowGroupMetaData {
699            columns,
700            num_rows,
701            sorting_columns,
702            total_byte_size,
703            schema_descr,
704            file_offset: rg.file_offset,
705            ordinal: rg.ordinal,
706        })
707    }
708
709    /// Method to convert from Thrift.
710    pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
711        if schema_descr.num_columns() != rg.columns.len() {
712            return Err(general_err!(
713                "Column count mismatch. Schema has {} columns while Row Group has {}",
714                schema_descr.num_columns(),
715                rg.columns.len()
716            ));
717        }
718        let total_byte_size = rg.total_byte_size;
719        let num_rows = rg.num_rows;
720        let mut columns = vec![];
721
722        for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) {
723            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
724        }
725
726        let sorting_columns = rg.sorting_columns;
727        Ok(RowGroupMetaData {
728            columns,
729            num_rows,
730            sorting_columns,
731            total_byte_size,
732            schema_descr,
733            file_offset: rg.file_offset,
734            ordinal: rg.ordinal,
735        })
736    }
737
738    /// Method to convert to Thrift.
739    pub fn to_thrift(&self) -> RowGroup {
740        RowGroup {
741            columns: self.columns().iter().map(|v| v.to_thrift()).collect(),
742            total_byte_size: self.total_byte_size,
743            num_rows: self.num_rows,
744            sorting_columns: self.sorting_columns().cloned(),
745            file_offset: self.file_offset(),
746            total_compressed_size: Some(self.compressed_size()),
747            ordinal: self.ordinal,
748        }
749    }
750
751    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
752    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
753        RowGroupMetaDataBuilder(self)
754    }
755}
756
/// Builder for row group metadata.
///
/// Create with [`RowGroupMetaData::builder`]; finish with [`Self::build`].
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
759
760impl RowGroupMetaDataBuilder {
761    /// Creates new builder from schema descriptor.
762    fn new(schema_descr: SchemaDescPtr) -> Self {
763        Self(RowGroupMetaData {
764            columns: Vec::with_capacity(schema_descr.num_columns()),
765            schema_descr,
766            file_offset: None,
767            num_rows: 0,
768            sorting_columns: None,
769            total_byte_size: 0,
770            ordinal: None,
771        })
772    }
773
774    /// Sets number of rows in this row group.
775    pub fn set_num_rows(mut self, value: i64) -> Self {
776        self.0.num_rows = value;
777        self
778    }
779
780    /// Sets the sorting order for columns
781    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
782        self.0.sorting_columns = value;
783        self
784    }
785
786    /// Sets total size in bytes for this row group.
787    pub fn set_total_byte_size(mut self, value: i64) -> Self {
788        self.0.total_byte_size = value;
789        self
790    }
791
792    /// Takes ownership of the the column metadata in this builder, and clears
793    /// the list of columns.
794    ///
795    /// This can be used for more efficient creation of a new RowGroupMetaData
796    /// from an existing one.
797    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
798        std::mem::take(&mut self.0.columns)
799    }
800
801    /// Sets column metadata for this row group.
802    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
803        self.0.columns = value;
804        self
805    }
806
807    /// Adds a column metadata to this row group
808    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
809        self.0.columns.push(value);
810        self
811    }
812
813    /// Sets ordinal for this row group.
814    pub fn set_ordinal(mut self, value: i16) -> Self {
815        self.0.ordinal = Some(value);
816        self
817    }
818
819    /// Sets file offset for this row group.
820    pub fn set_file_offset(mut self, value: i64) -> Self {
821        self.0.file_offset = Some(value);
822        self
823    }
824
825    /// Builds row group metadata.
826    pub fn build(self) -> Result<RowGroupMetaData> {
827        if self.0.schema_descr.num_columns() != self.0.columns.len() {
828            return Err(general_err!(
829                "Column length mismatch: {} != {}",
830                self.0.schema_descr.num_columns(),
831                self.0.columns.len()
832            ));
833        }
834
835        Ok(self.0)
836    }
837}
838
/// Metadata for a column chunk.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor of this leaf column
    column_descr: ColumnDescPtr,
    /// All encodings used for this column
    encodings: Vec<Encoding>,
    /// File where the column chunk is stored; if `None`, assumed to be the
    /// same file as the metadata
    file_path: Option<String>,
    /// Byte offset of `ColumnMetaData` in `file_path` (deprecated by the
    /// Parquet specification; modern writers set this to `0`)
    file_offset: i64,
    /// Total number of values in this column chunk
    num_values: i64,
    /// Compression codec used for this column
    compression: Compression,
    /// Total compressed data size, in bytes
    total_compressed_size: i64,
    /// Total uncompressed data size, in bytes
    total_uncompressed_size: i64,
    /// Offset of the first data page
    data_page_offset: i64,
    /// Offset of the index page, if any
    index_page_offset: Option<i64>,
    /// Offset of the dictionary page, if any
    dictionary_page_offset: Option<i64>,
    /// Statistics for this column chunk, if written
    statistics: Option<Statistics>,
    /// Per-page encoding statistics, if written
    encoding_stats: Option<Vec<PageEncodingStats>>,
    /// Offset of the bloom filter, if any
    bloom_filter_offset: Option<i64>,
    /// Length of the bloom filter, if any
    bloom_filter_length: Option<i32>,
    /// Offset of the offset index, if any
    offset_index_offset: Option<i64>,
    /// Length of the offset index, if any
    offset_index_length: Option<i32>,
    /// Offset of the column index, if any
    column_index_offset: Option<i64>,
    /// Length of the column index, if any
    column_index_length: Option<i32>,
    /// Number of bytes of variable length data after decoding
    /// (BYTE_ARRAY columns only; may be unset by older writers)
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Histogram of repetition levels (may be unset by older writers)
    repetition_level_histogram: Option<LevelHistogram>,
    /// Histogram of definition levels (may be unset by older writers)
    definition_level_histogram: Option<LevelHistogram>,
    /// Encryption metadata for this column chunk, if encrypted
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<ColumnCryptoMetaData>,
}
867
/// Histograms for repetition and definition levels.
///
/// The histogram is stored as a vector of length `max_level + 1`: the entry
/// at index `i` counts how many values occur at level `i`.
///
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the
/// number of rows with level 1, and so on.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a zeroed histogram with one bucket per level.
    ///
    /// The resulting length is `max_level + 1`.
    ///
    /// Returns `None` when `max_level == 0`, since a histogram carries no
    /// information in that case.
    pub fn try_new(max_level: i16) -> Option<Self> {
        (max_level > 0).then(|| Self {
            inner: vec![0; max_level as usize + 1],
        })
    }

    /// Returns a reference to the histogram's values.
    pub fn values(&self) -> &[i64] {
        self.inner.as_slice()
    }

    /// Consumes `self`, returning the underlying vector of counts.
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the count stored for level `index`.
    ///
    /// For example, `get(1)` returns the number of values with level 1.
    ///
    /// Returns `None` when `index` is past the end of the histogram.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Accumulates `other`'s counts into this histogram, bucket by bucket.
    ///
    /// # Panics
    /// If the two histograms have different lengths.
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (bucket, &count) in self.inner.iter_mut().zip(other.inner.iter()) {
            *bucket += count;
        }
    }

    /// Returns the number of buckets (i.e. `max_level + 1`).
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Returns `true` when the histogram holds no buckets at all.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Sets the values of all histogram levels to 0.
    pub fn reset(&mut self) {
        self.inner.fill(0);
    }

    /// Increments the bucket for each level in `levels`.
    ///
    /// # Panics
    /// If any level exceeds the maximum level this histogram was created
    /// with (the argument supplied to [`Self::try_new`]).
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &lvl in levels {
            self.inner[lvl as usize] += 1;
        }
    }
}

impl From<Vec<i64>> for LevelHistogram {
    fn from(inner: Vec<i64>) -> Self {
        Self { inner }
    }
}

impl From<LevelHistogram> for Vec<i64> {
    fn from(value: LevelHistogram) -> Self {
        value.into_inner()
    }
}
967
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        // Only the backing `Vec` allocates on the heap.
        self.inner.heap_size()
    }
}
973
/// Represents common operations for a column chunk.
impl ColumnChunkMetaData {
    /// Returns builder for column chunk metadata.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// File where the column chunk is stored.
    ///
    /// If not set, assumed to belong to the same file as the metadata.
    /// This path is relative to the current file.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Byte offset of `ColumnMetaData` in `file_path()`.
    ///
    /// Note that the meaning of this field has been inconsistent between implementations
    /// so its use has since been deprecated in the Parquet specification. Modern implementations
    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
    /// `ColumnChunk` struct.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Type of this column. Must be primitive.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Path (or identifier) of this column.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Descriptor for this column.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Reference counted clone of descriptor for this column.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// All encodings used for this column.
    pub fn encodings(&self) -> &Vec<Encoding> {
        &self.encodings
    }

    /// Total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Compression for this column.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed data size of this column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed data size of this column chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset for the column data.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset for the index page.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset for the dictionary page, if any.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the offset and length in bytes of the column chunk within the file
    pub fn byte_range(&self) -> (u64, u64) {
        // The chunk starts at the dictionary page when one exists, otherwise
        // at the first data page.
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns statistics that are set for this column chunk,
    /// or `None` if no statistics are available.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns the page encoding stats,
    /// or `None` if no page encoding stats are available.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        self.encoding_stats.as_ref()
    }

    /// Returns the offset for the bloom filter.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length of the bloom filter.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset for the column index.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length of the column index.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range for the column index if any
    pub(crate) fn column_index_range(&self) -> Option<Range<usize>> {
        let offset = usize::try_from(self.column_index_offset?).ok()?;
        let length = usize::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset for the offset index.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length of the offset index.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range for the offset index if any
    pub(crate) fn offset_index_range(&self) -> Option<Range<usize>> {
        let offset = usize::try_from(self.offset_index_offset?).ok()?;
        let length = usize::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of bytes of variable length data after decoding.
    ///
    /// Only set for BYTE_ARRAY columns. This field may not be set by older
    /// writers.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
    /// `vec[0]` indicates how many rows the page contains.
    /// This field may not be set by older writers.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
    /// This field may not be set by older writers.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the encryption metadata for this column chunk.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_ref()
    }

    /// Method to convert from Thrift.
    pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
        if cc.meta_data.is_none() {
            return Err(general_err!("Expected to have column metadata"));
        }
        let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap();
        let column_type = Type::try_from(col_metadata.type_)?;
        let encodings = col_metadata
            .encodings
            .drain(0..)
            .map(Encoding::try_from)
            .collect::<Result<_>>()?;
        let compression = Compression::try_from(col_metadata.codec)?;
        let file_path = cc.file_path;
        let file_offset = cc.file_offset;
        let num_values = col_metadata.num_values;
        let total_compressed_size = col_metadata.total_compressed_size;
        let total_uncompressed_size = col_metadata.total_uncompressed_size;
        let data_page_offset = col_metadata.data_page_offset;
        let index_page_offset = col_metadata.index_page_offset;
        let dictionary_page_offset = col_metadata.dictionary_page_offset;
        let statistics = statistics::from_thrift(column_type, col_metadata.statistics)?;
        let encoding_stats = col_metadata
            .encoding_stats
            .as_ref()
            .map(|vec| {
                vec.iter()
                    .map(page_encoding_stats::try_from_thrift)
                    .collect::<Result<_>>()
            })
            .transpose()?;
        let bloom_filter_offset = col_metadata.bloom_filter_offset;
        let bloom_filter_length = col_metadata.bloom_filter_length;
        let offset_index_offset = cc.offset_index_offset;
        let offset_index_length = cc.offset_index_length;
        let column_index_offset = cc.column_index_offset;
        let column_index_length = cc.column_index_length;
        // `SizeStatistics` is optional in the Thrift definition; all three of
        // its fields default to `None` when it is absent.
        let (
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
        ) = if let Some(size_stats) = col_metadata.size_statistics {
            (
                size_stats.unencoded_byte_array_data_bytes,
                size_stats.repetition_level_histogram,
                size_stats.definition_level_histogram,
            )
        } else {
            (None, None, None)
        };

        let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from);
        let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);

        #[cfg(feature = "encryption")]
        let column_crypto_metadata = if let Some(crypto_metadata) = cc.crypto_metadata {
            Some(column_crypto_metadata::try_from_thrift(&crypto_metadata)?)
        } else {
            None
        };

        let result = ColumnChunkMetaData {
            column_descr,
            encodings,
            file_path,
            file_offset,
            num_values,
            compression,
            total_compressed_size,
            total_uncompressed_size,
            data_page_offset,
            index_page_offset,
            dictionary_page_offset,
            statistics,
            encoding_stats,
            bloom_filter_offset,
            bloom_filter_length,
            offset_index_offset,
            offset_index_length,
            column_index_offset,
            column_index_length,
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
            #[cfg(feature = "encryption")]
            column_crypto_metadata,
        };
        Ok(result)
    }

    /// Method to convert to Thrift.
    pub fn to_thrift(&self) -> ColumnChunk {
        let column_metadata = self.to_column_metadata_thrift();

        ColumnChunk {
            file_path: self.file_path().map(|s| s.to_owned()),
            file_offset: self.file_offset,
            meta_data: Some(column_metadata),
            offset_index_offset: self.offset_index_offset,
            offset_index_length: self.offset_index_length,
            column_index_offset: self.column_index_offset,
            column_index_length: self.column_index_length,
            crypto_metadata: self.column_crypto_metadata_thrift(),
            encrypted_column_metadata: None,
        }
    }

    /// Method to convert to Thrift `ColumnMetaData`
    pub fn to_column_metadata_thrift(&self) -> ColumnMetaData {
        // Only emit `SizeStatistics` when at least one of its fields is set.
        let size_statistics = if self.unencoded_byte_array_data_bytes.is_some()
            || self.repetition_level_histogram.is_some()
            || self.definition_level_histogram.is_some()
        {
            let repetition_level_histogram = self
                .repetition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            let definition_level_histogram = self
                .definition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            Some(SizeStatistics {
                unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes,
                repetition_level_histogram,
                definition_level_histogram,
            })
        } else {
            None
        };

        ColumnMetaData {
            type_: self.column_type().into(),
            encodings: self.encodings().iter().map(|&v| v.into()).collect(),
            path_in_schema: self.column_path().as_ref().to_vec(),
            codec: self.compression.into(),
            num_values: self.num_values,
            total_uncompressed_size: self.total_uncompressed_size,
            total_compressed_size: self.total_compressed_size,
            key_value_metadata: None,
            data_page_offset: self.data_page_offset,
            index_page_offset: self.index_page_offset,
            dictionary_page_offset: self.dictionary_page_offset,
            statistics: statistics::to_thrift(self.statistics.as_ref()),
            encoding_stats: self
                .encoding_stats
                .as_ref()
                .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()),
            bloom_filter_offset: self.bloom_filter_offset,
            bloom_filter_length: self.bloom_filter_length,
            size_statistics,
        }
    }

    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }

    #[cfg(feature = "encryption")]
    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
        self.column_crypto_metadata
            .as_ref()
            .map(column_crypto_metadata::to_thrift)
    }

    #[cfg(not(feature = "encryption"))]
    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
        // Without the `encryption` feature no crypto metadata is ever stored.
        None
    }
}
1333
/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one.
///
/// See also [`ColumnChunkMetaData::builder`] for creating a builder from a
/// column descriptor.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1353
impl ColumnChunkMetaDataBuilder {
    /// Creates new column chunk metadata builder.
    ///
    /// All numeric fields start at zero, compression defaults to
    /// `UNCOMPRESSED`, and all optional fields are unset.
    ///
    /// See also [`ColumnChunkMetaData::builder`]
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Vec::new(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
        })
    }

    /// Sets list of encodings for this column chunk.
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets optional file path for this column chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets file offset in bytes.
    ///
    /// This field was meant to provide an alternate to storing `ColumnMetadata` directly in
    /// the `ColumnChunkMetadata`. However, most Parquet readers assume the `ColumnMetadata`
    /// is stored inline and ignore this field.
    #[deprecated(
        since = "53.0.0",
        note = "The Parquet specification requires this field to be 0"
    )]
    pub fn set_file_offset(mut self, value: i64) -> Self {
        self.0.file_offset = value;
        self
    }

    /// Sets number of values.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets compression.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets data page offset in bytes.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets optional dictionary page offset in bytes.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets optional index page offset in bytes.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets statistics for this column chunk.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Clears the statistics for this column chunk.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets page encoding stats for this column chunk.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(value);
        self
    }

    /// Clears the page encoding stats for this column chunk.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets optional bloom filter offset in bytes.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets optional bloom filter length in bytes.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets optional offset index offset in bytes.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets optional offset index length in bytes.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets optional column index offset in bytes.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets optional column index length in bytes.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets optional length of variable length data in bytes.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets optional repetition level histogram
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets optional definition level histogram
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the encryption metadata for an encrypted column
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value;
        self
    }

    /// Builds column chunk metadata.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}
1545
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
    /// For each page, whether it contains only null values
    null_pages: Vec<bool>,
    /// Serialized minimum value for each page
    min_values: Vec<Vec<u8>>,
    /// Serialized maximum value for each page
    max_values: Vec<Vec<u8>>,
    /// Number of null values in each page
    null_counts: Vec<i64>,
    /// Ordering of the min/max values across pages
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any entry in the page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-nan values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}
1568
impl Default for ColumnIndexBuilder {
    fn default() -> Self {
        // Delegate to `new` so the two constructors can never diverge.
        Self::new()
    }
}
1574
1575impl ColumnIndexBuilder {
1576    /// Creates a new column index builder.
1577    pub fn new() -> Self {
1578        ColumnIndexBuilder {
1579            null_pages: Vec::new(),
1580            min_values: Vec::new(),
1581            max_values: Vec::new(),
1582            null_counts: Vec::new(),
1583            boundary_order: BoundaryOrder::UNORDERED,
1584            repetition_level_histograms: None,
1585            definition_level_histograms: None,
1586            valid: true,
1587        }
1588    }
1589
1590    /// Append statistics for the next page
1591    pub fn append(
1592        &mut self,
1593        null_page: bool,
1594        min_value: Vec<u8>,
1595        max_value: Vec<u8>,
1596        null_count: i64,
1597    ) {
1598        self.null_pages.push(null_page);
1599        self.min_values.push(min_value);
1600        self.max_values.push(max_value);
1601        self.null_counts.push(null_count);
1602    }
1603
1604    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
1605    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
1606    pub fn append_histograms(
1607        &mut self,
1608        repetition_level_histogram: &Option<LevelHistogram>,
1609        definition_level_histogram: &Option<LevelHistogram>,
1610    ) {
1611        if !self.valid {
1612            return;
1613        }
1614        if let Some(ref rep_lvl_hist) = repetition_level_histogram {
1615            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1616            hist.reserve(rep_lvl_hist.len());
1617            hist.extend(rep_lvl_hist.values());
1618        }
1619        if let Some(ref def_lvl_hist) = definition_level_histogram {
1620            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1621            hist.reserve(def_lvl_hist.len());
1622            hist.extend(def_lvl_hist.values());
1623        }
1624    }
1625
1626    /// Set the boundary order of the column index
1627    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1628        self.boundary_order = boundary_order;
1629    }
1630
1631    /// Mark this column index as invalid
1632    pub fn to_invalid(&mut self) {
1633        self.valid = false;
1634    }
1635
1636    /// Is the information in the builder valid?
1637    pub fn valid(&self) -> bool {
1638        self.valid
1639    }
1640
1641    /// Build and get the thrift metadata of column index
1642    ///
1643    /// Note: callers should check [`Self::valid`] before calling this method
1644    pub fn build_to_thrift(self) -> ColumnIndex {
1645        ColumnIndex::new(
1646            self.null_pages,
1647            self.min_values,
1648            self.max_values,
1649            self.boundary_order,
1650            self.null_counts,
1651            self.repetition_level_histograms,
1652            self.definition_level_histograms,
1653        )
1654    }
1655}
1656
1657impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1658    fn from(value: ColumnChunkMetaData) -> Self {
1659        ColumnChunkMetaDataBuilder(value)
1660    }
1661}
1662
/// Builder for offset index, part of the Parquet [PageIndex].
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
    // Offset of each page; becomes `PageLocation::offset`.
    offset_array: Vec<i64>,
    // Compressed size in bytes of each page; becomes `PageLocation::compressed_page_size`.
    compressed_page_size_array: Vec<i32>,
    // First row index of each page; becomes `PageLocation::first_row_index`.
    first_row_index_array: Vec<i64>,
    // Per-page unencoded byte counts; stays `None` until the first `Some`
    // value is appended (see `append_unencoded_byte_array_data_bytes`).
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    // Running row total used to derive the first row index of the next page.
    current_first_row_index: i64,
}
1673
1674impl Default for OffsetIndexBuilder {
1675    fn default() -> Self {
1676        Self::new()
1677    }
1678}
1679
1680impl OffsetIndexBuilder {
1681    /// Creates a new offset index builder.
1682    pub fn new() -> Self {
1683        OffsetIndexBuilder {
1684            offset_array: Vec::new(),
1685            compressed_page_size_array: Vec::new(),
1686            first_row_index_array: Vec::new(),
1687            unencoded_byte_array_data_bytes_array: None,
1688            current_first_row_index: 0,
1689        }
1690    }
1691
1692    /// Append the row count of the next page.
1693    pub fn append_row_count(&mut self, row_count: i64) {
1694        let current_page_row_index = self.current_first_row_index;
1695        self.first_row_index_array.push(current_page_row_index);
1696        self.current_first_row_index += row_count;
1697    }
1698
1699    /// Append the offset and size of the next page.
1700    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1701        self.offset_array.push(offset);
1702        self.compressed_page_size_array.push(compressed_page_size);
1703    }
1704
1705    /// Append the unencoded byte array data bytes of the next page.
1706    pub fn append_unencoded_byte_array_data_bytes(
1707        &mut self,
1708        unencoded_byte_array_data_bytes: Option<i64>,
1709    ) {
1710        if let Some(val) = unencoded_byte_array_data_bytes {
1711            self.unencoded_byte_array_data_bytes_array
1712                .get_or_insert(Vec::new())
1713                .push(val);
1714        }
1715    }
1716
1717    /// Build and get the thrift metadata of offset index
1718    pub fn build_to_thrift(self) -> OffsetIndex {
1719        let locations = self
1720            .offset_array
1721            .iter()
1722            .zip(self.compressed_page_size_array.iter())
1723            .zip(self.first_row_index_array.iter())
1724            .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index))
1725            .collect::<Vec<_>>();
1726        OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array)
1727    }
1728}
1729
#[cfg(test)]
mod tests {
    use super::*;
    use crate::basic::{PageType, SortOrder};
    use crate::file::page_index::index::NativeIndex;

    // Round-trips row group metadata through its thrift representation and
    // checks the result is unchanged.
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        // to_thrift -> from_thrift -> to_thrift must be the identity.
        let row_group_exp = row_group_meta.to_thrift();
        let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(row_group_res, row_group_exp);
    }

    // Building a row group with no column metadata must fail: the test schema
    // declares 2 columns, but none were provided.
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }

    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();

        // Decoding a 2-column row group against a 3-column schema simulates
        // the corruption and must be rejected with a descriptive error.
        let err =
            RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift())
                .unwrap_err()
                .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }

    // Round-trips a column chunk with every optional field populated through
    // its thrift representation.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        let col_chunk_res =
            ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    // Round-trips a column chunk with no optional fields set.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_empty() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .build()
            .unwrap();

        let col_chunk_exp = col_metadata.to_thrift();
        let col_chunk_res = ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(col_chunk_res, col_chunk_exp);
    }

    // compressed_size() must sum the compressed sizes of all column chunks
    // in the row group (2 columns x 500 bytes = 1000).
    #[test]
    fn test_compressed_size() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for column_descr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(column_descr.clone())
                .set_total_compressed_size(500)
                .set_total_uncompressed_size(700)
                .build()
                .unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();

        let compressed_size_res: i64 = row_group_meta.compressed_size();
        let compressed_size_exp: i64 = 1000;

        assert_eq!(compressed_size_res, compressed_size_exp);
    }

    // Checks ParquetMetaData::memory_size against hard-coded expected byte
    // counts. NOTE: these constants are tied to the exact construction below
    // and to struct layout, and must be updated whenever metadata structs
    // change size.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        // Columns with "empty" statistics (no min/max values).
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Now, add in Exact Statistics
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        // Expected size differs by feature because the encryption feature
        // adds fields to the metadata structs.
        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2312;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2640;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Add a ColumnIndex to grow the measured footprint.
        let mut column_index = ColumnIndexBuilder::new();
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build_to_thrift();
        let native_index = NativeIndex::<bool>::try_new(column_index).unwrap();

        // Now, add in OffsetIndex
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build_to_thrift();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![
                OffsetIndexMetaData::try_new(offset_index).unwrap()
            ]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 2816;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3144;

        // more set fields means more memory usage
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }

    /// Returns sample schema descriptor so we can create column metadata.
    fn get_test_schema_descr() -> SchemaDescPtr {
        // Two INT32 leaf columns "a" and "b" under a group root.
        let schema = SchemaType::group_type_builder("schema")
            .with_fields(vec![
                Arc::new(
                    SchemaType::primitive_type_builder("a", Type::INT32)
                        .build()
                        .unwrap(),
                ),
                Arc::new(
                    SchemaType::primitive_type_builder("b", Type::INT32)
                        .build()
                        .unwrap(),
                ),
            ])
            .build()
            .unwrap();

        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
    }
}