Skip to main content

parquet/file/metadata/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet metadata API
19//!
20//! Users should use these structures to interact with Parquet metadata.
21//!
22//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
23//!   file footer.
24//!
25//! * [`FileMetaData`]: File level metadata such as schema, row counts and
26//!   version.
27//!
//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
29//!   location and number of rows, and column chunks.
30//!
31//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
32//!   within a Row Group including encoding and compression information,
33//!   number of values, statistics, etc.
34//!
35//! # APIs for working with Parquet Metadata
36//!
37//! The Parquet readers and writers in this crate handle reading and writing
38//! metadata into parquet files. To work with metadata directly,
39//! the following APIs are available:
40//!
41//! * [`ParquetMetaDataReader`] for reading metadata from an I/O source (sync and async)
42//! * [`ParquetMetaDataPushDecoder`] for decoding from bytes without I/O
43//! * [`ParquetMetaDataWriter`] for writing.
44//!
45//! # Examples
46//!
47//! Please see [`external_metadata.rs`]
48//!
49//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
50//!
51//! # Metadata Encodings and Structures
52//!
53//! There are three different encodings of Parquet Metadata in this crate:
54//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
56//!    [parquet.thrift]
57//!
58//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
59//!    from [parquet.thrift]. These structures are low level and mirror
60//!    the thrift definitions.
61//!
62//! 3. [`file::metadata`] (this module): Easier to use Rust structures
63//!    with a more idiomatic API. Note that, confusingly, some but not all
64//!    of these structures have the same name as the [`format`] structures.
65//!
66//! [`file::metadata`]: crate::file::metadata
67//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
68//!
69//! Graphically, this is how the different structures relate to each other:
70//!
71//! ```text
72//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
73//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
74//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
75//!                            └──────────────┘     │         └───────────────────────┘ │
76//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
77//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
78//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
79//!                                     ...         │                   ...             │
80//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
81//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
82//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
83//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
84//!
85//!                          format::meta structures          file::metadata structures
86//!
87//!                         * Same name, different struct
88//! ```
89mod footer_tail;
90mod memory;
91mod options;
92mod parser;
93mod push_decoder;
94pub(crate) mod reader;
95pub(crate) mod thrift;
96mod writer;
97
98use crate::basic::{EncodingMask, PageType};
99#[cfg(feature = "encryption")]
100use crate::encryption::decrypt::FileDecryptor;
101#[cfg(feature = "encryption")]
102use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
103pub(crate) use crate::file::metadata::memory::HeapSize;
104#[cfg(feature = "encryption")]
105use crate::file::metadata::thrift::encryption::EncryptionAlgorithm;
106use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex};
107use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation};
108use crate::file::statistics::Statistics;
109use crate::geospatial::statistics as geo_statistics;
110use crate::schema::types::{
111    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
112    Type as SchemaType,
113};
114use crate::thrift_struct;
115use crate::{
116    basic::BoundaryOrder,
117    errors::{ParquetError, Result},
118};
119use crate::{
120    basic::{ColumnOrder, Compression, Encoding, Type},
121    parquet_thrift::{
122        ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
123        ThriftCompactOutputProtocol, WriteThrift, WriteThriftField,
124    },
125};
126use crate::{
127    data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
128};
129
130pub use footer_tail::FooterTail;
131pub use options::{ParquetMetaDataOptions, ParquetStatisticsPolicy};
132pub use push_decoder::ParquetMetaDataPushDecoder;
133pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
134use std::io::Write;
135use std::ops::Range;
136use std::sync::Arc;
137pub use writer::ParquetMetaDataWriter;
138pub(crate) use writer::ThriftMetadataWriter;
139
/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`ColumnIndex`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`ColumnIndex`] corresponding to column `column_number` of row group
/// `row_group_number`. Both indexes are zero-based.
///
/// For example `column_index[2][3]` holds the [`ColumnIndex`] for the fourth
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;
157
/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number`. Both indexes are zero-based.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
170
/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata
    file_metadata: FileMetaData,
    /// Row group metadata, one entry per row group in file order
    row_groups: Vec<RowGroupMetaData>,
    /// Page level index for each page in each column chunk
    /// (`None` unless it was explicitly loaded)
    column_index: Option<ParquetColumnIndex>,
    /// Offset index for each page in each column chunk
    /// (`None` unless it was explicitly loaded)
    offset_index: Option<ParquetOffsetIndex>,
    /// Optional file decryptor; boxed to keep this struct small when
    /// encryption is not in use
    #[cfg(feature = "encryption")]
    file_decryptor: Option<Box<FileDecryptor>>,
}
202
203impl ParquetMetaData {
204    /// Creates Parquet metadata from file metadata and a list of row
205    /// group metadata
206    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
207        ParquetMetaData {
208            file_metadata,
209            row_groups,
210            column_index: None,
211            offset_index: None,
212            #[cfg(feature = "encryption")]
213            file_decryptor: None,
214        }
215    }
216
217    /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of
218    /// encrypted data.
219    #[cfg(feature = "encryption")]
220    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
221        self.file_decryptor = file_decryptor.map(Box::new);
222    }
223
224    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
225    pub fn into_builder(self) -> ParquetMetaDataBuilder {
226        self.into()
227    }
228
229    /// Returns file metadata as reference.
230    pub fn file_metadata(&self) -> &FileMetaData {
231        &self.file_metadata
232    }
233
234    /// Returns file decryptor as reference.
235    #[cfg(feature = "encryption")]
236    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
237        self.file_decryptor.as_deref()
238    }
239
240    /// Returns number of row groups in this file.
241    pub fn num_row_groups(&self) -> usize {
242        self.row_groups.len()
243    }
244
245    /// Returns row group metadata for `i`th position.
246    /// Position should be less than number of row groups `num_row_groups`.
247    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
248        &self.row_groups[i]
249    }
250
251    /// Returns slice of row groups in this file.
252    pub fn row_groups(&self) -> &[RowGroupMetaData] {
253        &self.row_groups
254    }
255
256    /// Returns the column index for this file if loaded
257    ///
258    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
259    /// [ArrowReaderOptions::with_page_index] was set to false.
260    ///
261    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
262    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
263        self.column_index.as_ref()
264    }
265
266    /// Returns offset indexes in this file, if loaded
267    ///
268    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
269    /// [ArrowReaderOptions::with_page_index] was set to false.
270    ///
271    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
272    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
273        self.offset_index.as_ref()
274    }
275
276    /// Estimate of the bytes allocated to store `ParquetMetadata`
277    ///
278    /// # Notes:
279    ///
280    /// 1. Includes size of self
281    ///
282    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
283    ///    [`RowGroupMetaData`].
284    ///
285    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
286    ///    means `memory_size` will over estimate the memory size if such pointers
287    ///    are shared.
288    ///
289    /// 4. Does not include any allocator overheads
290    pub fn memory_size(&self) -> usize {
291        #[cfg(feature = "encryption")]
292        let encryption_size = self.file_decryptor.heap_size();
293        #[cfg(not(feature = "encryption"))]
294        let encryption_size = 0usize;
295
296        std::mem::size_of::<Self>()
297            + self.file_metadata.heap_size()
298            + self.row_groups.heap_size()
299            + self.column_index.heap_size()
300            + self.offset_index.heap_size()
301            + encryption_size
302    }
303
304    /// Override the column index
305    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
306        self.column_index = index;
307    }
308
309    /// Override the offset index
310    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
311        self.offset_index = index;
312    }
313}
314
/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// # Example creating a new [`ParquetMetaData`]
///
/// ```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
pub struct ParquetMetaDataBuilder(ParquetMetaData);
353
354impl ParquetMetaDataBuilder {
355    /// Create a new builder from a file metadata, with no row groups
356    pub fn new(file_meta_data: FileMetaData) -> Self {
357        Self(ParquetMetaData::new(file_meta_data, vec![]))
358    }
359
360    /// Create a new builder from an existing ParquetMetaData
361    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
362        Self(metadata)
363    }
364
365    /// Adds a row group to the metadata
366    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
367        self.0.row_groups.push(row_group);
368        self
369    }
370
371    /// Sets all the row groups to the specified list
372    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
373        self.0.row_groups = row_groups;
374        self
375    }
376
377    /// Takes ownership of the row groups in this builder, and clears the list
378    /// of row groups.
379    ///
380    /// This can be used for more efficient creation of a new ParquetMetaData
381    /// from an existing one.
382    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
383        std::mem::take(&mut self.0.row_groups)
384    }
385
386    /// Return a reference to the current row groups
387    pub fn row_groups(&self) -> &[RowGroupMetaData] {
388        &self.0.row_groups
389    }
390
391    /// Sets the column index
392    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
393        self.0.column_index = column_index;
394        self
395    }
396
397    /// Returns the current column index from the builder, replacing it with `None`
398    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
399        std::mem::take(&mut self.0.column_index)
400    }
401
402    /// Return a reference to the current column index, if any
403    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
404        self.0.column_index.as_ref()
405    }
406
407    /// Sets the offset index
408    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
409        self.0.offset_index = offset_index;
410        self
411    }
412
413    /// Returns the current offset index from the builder, replacing it with `None`
414    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
415        std::mem::take(&mut self.0.offset_index)
416    }
417
418    /// Return a reference to the current offset index, if any
419    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
420        self.0.offset_index.as_ref()
421    }
422
423    /// Sets the file decryptor needed to decrypt this metadata.
424    #[cfg(feature = "encryption")]
425    pub(crate) fn set_file_decryptor(mut self, file_decryptor: Option<FileDecryptor>) -> Self {
426        self.0.with_file_decryptor(file_decryptor);
427        self
428    }
429
430    /// Creates a new ParquetMetaData from the builder
431    pub fn build(self) -> ParquetMetaData {
432        let Self(metadata) = self;
433        metadata
434    }
435}
436
437impl From<ParquetMetaData> for ParquetMetaDataBuilder {
438    fn from(meta_data: ParquetMetaData) -> Self {
439        Self(meta_data)
440    }
441}
442
// `thrift_struct!` expands this thrift-IDL-style definition into a Rust
// struct plus Thrift (de)serialization support; field ids match the
// definitions in parquet.thrift.
thrift_struct!(
/// A key-value pair for [`FileMetaData`].
pub struct KeyValue {
  1: required string key
  2: optional string value
}
);
450
451impl KeyValue {
452    /// Create a new key value pair
453    pub fn new<F2>(key: String, value: F2) -> KeyValue
454    where
455        F2: Into<Option<String>>,
456    {
457        KeyValue {
458            key,
459            value: value.into(),
460        }
461    }
462}
463
// `thrift_struct!` expands this thrift-IDL-style definition into a Rust
// struct plus Thrift (de)serialization support; field ids match the
// definitions in parquet.thrift.
thrift_struct!(
/// PageEncodingStats for a column chunk and data page.
pub struct PageEncodingStats {
  1: required PageType page_type;
  2: required Encoding encoding;
  3: required i32 count;
}
);
472
/// Internal representation of the page encoding stats in the [`ColumnChunkMetaData`].
/// This is not publicly exposed, with different getters defined for each variant.
#[derive(Debug, Clone, PartialEq)]
enum ParquetPageEncodingStats {
    /// The full array of stats as defined in the Parquet spec.
    Full(Vec<PageEncodingStats>),
    /// A condensed version of only page encodings seen (more compact than
    /// `Full`; per-page-type counts are not retained).
    Mask(EncodingMask),
}
482
/// Reference counted pointer ([`Arc`]) for [`FileMetaData`], allowing it to
/// be shared cheaply.
pub type FileMetaDataPtr = Arc<FileMetaData>;
485
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Version of this file
    version: i32,
    /// Number of rows in the file
    num_rows: i64,
    /// Application that wrote this file, if recorded
    created_by: Option<String>,
    /// Arbitrary user-supplied key/value pairs, if any
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Shared descriptor of the file's schema
    schema_descr: SchemaDescPtr,
    /// Sort order used for min/max statistics, one entry per column, if present
    column_orders: Option<Vec<ColumnOrder>>,
    /// Encryption algorithm recorded in the footer; boxed because rarely present
    #[cfg(feature = "encryption")]
    encryption_algorithm: Option<Box<EncryptionAlgorithm>>,
    /// Key metadata used for footer signing
    #[cfg(feature = "encryption")]
    footer_signing_key_metadata: Option<Vec<u8>>,
}
502
503impl FileMetaData {
504    /// Creates new file metadata.
505    pub fn new(
506        version: i32,
507        num_rows: i64,
508        created_by: Option<String>,
509        key_value_metadata: Option<Vec<KeyValue>>,
510        schema_descr: SchemaDescPtr,
511        column_orders: Option<Vec<ColumnOrder>>,
512    ) -> Self {
513        FileMetaData {
514            version,
515            num_rows,
516            created_by,
517            key_value_metadata,
518            schema_descr,
519            column_orders,
520            #[cfg(feature = "encryption")]
521            encryption_algorithm: None,
522            #[cfg(feature = "encryption")]
523            footer_signing_key_metadata: None,
524        }
525    }
526
527    #[cfg(feature = "encryption")]
528    pub(crate) fn with_encryption_algorithm(
529        mut self,
530        encryption_algorithm: Option<EncryptionAlgorithm>,
531    ) -> Self {
532        self.encryption_algorithm = encryption_algorithm.map(Box::new);
533        self
534    }
535
536    #[cfg(feature = "encryption")]
537    pub(crate) fn with_footer_signing_key_metadata(
538        mut self,
539        footer_signing_key_metadata: Option<Vec<u8>>,
540    ) -> Self {
541        self.footer_signing_key_metadata = footer_signing_key_metadata;
542        self
543    }
544
545    /// Returns version of this file.
546    pub fn version(&self) -> i32 {
547        self.version
548    }
549
550    /// Returns number of rows in the file.
551    pub fn num_rows(&self) -> i64 {
552        self.num_rows
553    }
554
555    /// String message for application that wrote this file.
556    ///
557    /// This should have the following format:
558    /// `<application> version <application version> (build <application build hash>)`.
559    ///
560    /// ```shell
561    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
562    /// ```
563    pub fn created_by(&self) -> Option<&str> {
564        self.created_by.as_deref()
565    }
566
567    /// Returns key_value_metadata of this file.
568    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
569        self.key_value_metadata.as_ref()
570    }
571
572    /// Returns Parquet [`Type`] that describes schema in this file.
573    ///
574    /// [`Type`]: crate::schema::types::Type
575    pub fn schema(&self) -> &SchemaType {
576        self.schema_descr.root_schema()
577    }
578
579    /// Returns a reference to schema descriptor.
580    pub fn schema_descr(&self) -> &SchemaDescriptor {
581        &self.schema_descr
582    }
583
584    /// Returns reference counted clone for schema descriptor.
585    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
586        self.schema_descr.clone()
587    }
588
589    /// Column (sort) order used for `min` and `max` values of each column in this file.
590    ///
591    /// Each column order corresponds to one column, determined by its position in the
592    /// list, matching the position of the column in the schema.
593    ///
594    /// When `None` is returned, there are no column orders available, and each column
595    /// should be assumed to have undefined (legacy) column order.
596    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
597        self.column_orders.as_ref()
598    }
599
600    /// Returns column order for `i`th column in this file.
601    /// If column orders are not available, returns undefined (legacy) column order.
602    pub fn column_order(&self, i: usize) -> ColumnOrder {
603        self.column_orders
604            .as_ref()
605            .map(|data| data[i])
606            .unwrap_or(ColumnOrder::UNDEFINED)
607    }
608}
609
// `thrift_struct!` expands this thrift-IDL-style definition into a Rust
// struct plus Thrift (de)serialization support; field ids match the
// definitions in parquet.thrift.
thrift_struct!(
/// Sort order within a RowGroup of a leaf column
pub struct SortingColumn {
  /// The ordinal position of the column (in this row group)
  1: required i32 column_idx

  /// If true, indicates this column is sorted in descending order.
  2: required bool descending

  /// If true, nulls will come before non-null values, otherwise,
  /// nulls go at the end.
  3: required bool nulls_first
}
);
624
/// Reference counted pointer ([`Arc`]) for [`RowGroupMetaData`], allowing it
/// to be shared cheaply.
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
627
/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows,
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk, one per leaf column of the schema
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group
    num_rows: i64,
    /// Sort ordering of the rows, if any
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total uncompressed byte size of all column data in this row group
    total_byte_size: i64,
    /// Shared schema descriptor
    schema_descr: SchemaDescPtr,
    /// We can't infer this from the file offset of the first column since
    /// there may be empty columns in the row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
}
644
645impl RowGroupMetaData {
646    /// Returns builder for row group metadata.
647    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
648        RowGroupMetaDataBuilder::new(schema_descr)
649    }
650
651    /// Number of columns in this row group.
652    pub fn num_columns(&self) -> usize {
653        self.columns.len()
654    }
655
656    /// Returns column chunk metadata for `i`th column.
657    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
658        &self.columns[i]
659    }
660
661    /// Returns slice of column chunk metadata.
662    pub fn columns(&self) -> &[ColumnChunkMetaData] {
663        &self.columns
664    }
665
666    /// Returns mutable slice of column chunk metadata.
667    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
668        &mut self.columns
669    }
670
671    /// Number of rows in this row group.
672    pub fn num_rows(&self) -> i64 {
673        self.num_rows
674    }
675
676    /// Returns the sort ordering of the rows in this RowGroup if any
677    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
678        self.sorting_columns.as_ref()
679    }
680
681    /// Total byte size of all uncompressed column data in this row group.
682    pub fn total_byte_size(&self) -> i64 {
683        self.total_byte_size
684    }
685
686    /// Total size of all compressed column data in this row group.
687    pub fn compressed_size(&self) -> i64 {
688        self.columns.iter().map(|c| c.total_compressed_size).sum()
689    }
690
691    /// Returns reference to a schema descriptor.
692    pub fn schema_descr(&self) -> &SchemaDescriptor {
693        self.schema_descr.as_ref()
694    }
695
696    /// Returns reference counted clone of schema descriptor.
697    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
698        self.schema_descr.clone()
699    }
700
701    /// Returns ordinal position of this row group in file.
702    ///
703    /// For example if this is the first row group in the file, this will return 0.
704    /// If this is the second row group in the file, this will return 1.
705    #[inline(always)]
706    pub fn ordinal(&self) -> Option<i16> {
707        self.ordinal
708    }
709
710    /// Returns file offset of this row group in file.
711    #[inline(always)]
712    pub fn file_offset(&self) -> Option<i64> {
713        self.file_offset
714    }
715
716    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
717    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
718        RowGroupMetaDataBuilder(self)
719    }
720}
721
/// Builder for row group metadata.
///
/// Wraps the [`RowGroupMetaData`] being built; see [`RowGroupMetaData::builder`].
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
724
725impl RowGroupMetaDataBuilder {
726    /// Creates new builder from schema descriptor.
727    fn new(schema_descr: SchemaDescPtr) -> Self {
728        Self(RowGroupMetaData {
729            columns: Vec::with_capacity(schema_descr.num_columns()),
730            schema_descr,
731            file_offset: None,
732            num_rows: 0,
733            sorting_columns: None,
734            total_byte_size: 0,
735            ordinal: None,
736        })
737    }
738
739    /// Sets number of rows in this row group.
740    pub fn set_num_rows(mut self, value: i64) -> Self {
741        self.0.num_rows = value;
742        self
743    }
744
745    /// Sets the sorting order for columns
746    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
747        self.0.sorting_columns = value;
748        self
749    }
750
751    /// Sets total size in bytes for this row group.
752    pub fn set_total_byte_size(mut self, value: i64) -> Self {
753        self.0.total_byte_size = value;
754        self
755    }
756
757    /// Takes ownership of the the column metadata in this builder, and clears
758    /// the list of columns.
759    ///
760    /// This can be used for more efficient creation of a new RowGroupMetaData
761    /// from an existing one.
762    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
763        std::mem::take(&mut self.0.columns)
764    }
765
766    /// Sets column metadata for this row group.
767    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
768        self.0.columns = value;
769        self
770    }
771
772    /// Adds a column metadata to this row group
773    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
774        self.0.columns.push(value);
775        self
776    }
777
778    /// Sets ordinal for this row group.
779    pub fn set_ordinal(mut self, value: i16) -> Self {
780        self.0.ordinal = Some(value);
781        self
782    }
783
784    /// Sets file offset for this row group.
785    pub fn set_file_offset(mut self, value: i64) -> Self {
786        self.0.file_offset = Some(value);
787        self
788    }
789
790    /// Builds row group metadata.
791    pub fn build(self) -> Result<RowGroupMetaData> {
792        if self.0.schema_descr.num_columns() != self.0.columns.len() {
793            return Err(general_err!(
794                "Column length mismatch: {} != {}",
795                self.0.schema_descr.num_columns(),
796                self.0.columns.len()
797            ));
798        }
799
800        Ok(self.0)
801    }
802
803    /// Build row group metadata without validation.
804    pub(super) fn build_unchecked(self) -> RowGroupMetaData {
805        self.0
806    }
807}
808
/// Metadata for a column chunk.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor (path, type, levels) of this leaf column
    column_descr: ColumnDescPtr,
    /// Mask of all encodings used by pages in this chunk
    encodings: EncodingMask,
    /// Path of the file holding this chunk's data, when stored separately
    /// from the footer's file
    file_path: Option<String>,
    /// Byte offset recorded for this chunk in `file_path`
    /// (NOTE(review): presumably the deprecated `ColumnChunk.file_offset`
    /// from parquet.thrift — confirm)
    file_offset: i64,
    /// Total number of values in this column chunk
    num_values: i64,
    /// Compression codec applied to this chunk's pages
    compression: Compression,
    /// Total compressed size, in bytes, of this chunk
    total_compressed_size: i64,
    /// Total uncompressed size, in bytes, of this chunk
    total_uncompressed_size: i64,
    /// Byte offset of the first data page
    data_page_offset: i64,
    /// Byte offset of the index page, if present
    index_page_offset: Option<i64>,
    /// Byte offset of the dictionary page, if present
    dictionary_page_offset: Option<i64>,
    /// Chunk level statistics, if present
    statistics: Option<Statistics>,
    /// Geospatial statistics, if present; boxed because rarely populated
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    /// Page encoding statistics, either in full or condensed (mask) form
    encoding_stats: Option<ParquetPageEncodingStats>,
    /// Byte offset of the bloom filter, if present
    bloom_filter_offset: Option<i64>,
    /// Length, in bytes, of the bloom filter, if present
    bloom_filter_length: Option<i32>,
    /// Byte offset of this chunk's `OffsetIndex`, if present
    offset_index_offset: Option<i64>,
    /// Length, in bytes, of this chunk's `OffsetIndex`, if present
    offset_index_length: Option<i32>,
    /// Byte offset of this chunk's `ColumnIndex`, if present
    column_index_offset: Option<i64>,
    /// Length, in bytes, of this chunk's `ColumnIndex`, if present
    column_index_length: Option<i32>,
    /// Number of bytes of byte array data before encoding, if recorded
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Histogram of repetition levels for this chunk, if recorded
    repetition_level_histogram: Option<LevelHistogram>,
    /// Histogram of definition levels for this chunk, if recorded
    definition_level_histogram: Option<LevelHistogram>,
    /// Encryption metadata for this column, if encrypted
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
    /// Serialized encrypted column metadata, if present
    #[cfg(feature = "encryption")]
    encrypted_column_metadata: Option<Vec<u8>>,
    /// When true, indicates the footer is plaintext (not encrypted).
    /// This affects how column metadata is serialized when `encrypted_column_metadata` is present.
    /// This field is only used at write time and is not needed when reading metadata.
    #[cfg(feature = "encryption")]
    plaintext_footer_mode: bool,
}
845
/// Histograms for repetition and definition levels.
///
/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of
/// values at level `i`.
///
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the
/// number of rows with level 1, and so on.
///
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a new level histogram data.
    ///
    /// Length will be `max_level + 1`.
    ///
    /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case)
    pub fn try_new(max_level: i16) -> Option<Self> {
        // `bool::then` only builds (and allocates) the histogram when needed.
        (max_level > 0).then(|| Self {
            inner: vec![0; max_level as usize + 1],
        })
    }

    /// Returns a reference to the histogram's values.
    pub fn values(&self) -> &[i64] {
        &self.inner
    }

    /// Return the inner vector, consuming self
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the histogram value at the given index.
    ///
    /// The value of `i` is the number of values with level `i`. For example,
    /// `get(1)` returns the number of values with level 1.
    ///
    /// Returns `None` if the index is out of bounds.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Adds the values from the other histogram to this histogram
    ///
    /// # Panics
    /// If the histograms have different lengths
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
            *dst += src;
        }
    }

    /// return the length of the histogram
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// returns if the histogram is empty
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Sets the values of all histogram levels to 0.
    pub fn reset(&mut self) {
        self.inner.fill(0);
    }

    /// Updates histogram values using provided repetition levels
    ///
    /// # Panics
    /// If any level is negative, or greater than `max_level` (the argument
    /// supplied to [`Self::try_new`]) — i.e. whenever `level as usize >= self.len()`.
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &level in levels {
            self.inner[level as usize] += 1;
        }
    }
}
933
934impl From<Vec<i64>> for LevelHistogram {
935    fn from(inner: Vec<i64>) -> Self {
936        Self { inner }
937    }
938}
939
940impl From<LevelHistogram> for Vec<i64> {
941    fn from(value: LevelHistogram) -> Self {
942        value.into_inner()
943    }
944}
945
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        // Only the inner `Vec`'s heap allocation contributes; the struct itself is inline.
        self.inner.heap_size()
    }
}
951
/// Represents common operations for a column chunk.
impl ColumnChunkMetaData {
    /// Returns builder for column chunk metadata.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// File where the column chunk is stored.
    ///
    /// If not set, assumed to belong to the same file as the metadata.
    /// This path is relative to the current file.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Byte offset of `ColumnMetaData` in `file_path()`.
    ///
    /// Note that the meaning of this field has been inconsistent between implementations
    /// so its use has since been deprecated in the Parquet specification. Modern implementations
    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
    /// `ColumnChunk` struct.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Type of this column. Must be primitive.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Path (or identifier) of this column.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Descriptor for this column.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Reference counted clone of descriptor for this column.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// All encodings used for this column.
    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
        self.encodings.encodings()
    }

    /// All encodings used for this column, returned as a bitmask.
    pub fn encodings_mask(&self) -> &EncodingMask {
        &self.encodings
    }

    /// Total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Compression for this column.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed data size of this column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed data size of this column chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset for the column data.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset for the index page.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset for the dictionary page, if any.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the offset and length in bytes of the column chunk within the file
    ///
    /// # Panics
    /// Panics if the chunk's start offset or compressed length is negative.
    pub fn byte_range(&self) -> (u64, u64) {
        // The chunk begins at the dictionary page when present, otherwise at the first data page.
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns statistics that are set for this column chunk,
    /// or `None` if no statistics are available.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns geospatial statistics that are set for this column chunk,
    /// or `None` if no geospatial statistics are available.
    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
        self.geo_statistics.as_deref()
    }

    /// Returns the page encoding statistics, or `None` if no page encoding statistics
    /// are available (or they were converted to a mask).
    ///
    /// Note: By default, this crate converts page encoding statistics to a mask for performance
    /// reasons. To get the full statistics, you must set [`ParquetMetaDataOptions::with_encoding_stats_as_mask`]
    /// to `false`.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        match self.encoding_stats.as_ref() {
            Some(ParquetPageEncodingStats::Full(stats)) => Some(stats),
            _ => None,
        }
    }

    /// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are
    /// not available (or they were left in their original form).
    ///
    /// Note: This is the default behavior for this crate.
    ///
    /// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to
    /// enable fast determination of whether all pages in a column chunk are dictionary encoded
    /// (see <https://github.com/apache/parquet-format/pull/16>).
    /// Decoding the full page encoding statistics, however, can be very costly, and is not
    /// necessary to support the aforementioned use case. As an alternative, this crate can
    /// instead distill the list of `PageEncodingStats` down to a bitmask of just the encodings
    /// used for data pages
    /// (see [`ParquetMetaDataOptions::set_encoding_stats_as_mask`]).
    /// To test for an all-dictionary-encoded chunk one could use this bitmask in the following way:
    ///
    /// ```rust
    /// use parquet::basic::Encoding;
    /// use parquet::file::metadata::ColumnChunkMetaData;
    /// // test if all data pages in the column chunk are dictionary encoded
    /// fn is_all_dictionary_encoded(col_meta: &ColumnChunkMetaData) -> bool {
    ///     // check that dictionary encoding was used
    ///     col_meta.dictionary_page_offset().is_some()
    ///         && col_meta.page_encoding_stats_mask().is_some_and(|mask| {
    ///             // mask should only have one bit set, either for PLAIN_DICTIONARY or
    ///             // RLE_DICTIONARY
    ///             mask.is_only(Encoding::PLAIN_DICTIONARY) || mask.is_only(Encoding::RLE_DICTIONARY)
    ///         })
    /// }
    /// ```
    pub fn page_encoding_stats_mask(&self) -> Option<&EncodingMask> {
        match self.encoding_stats.as_ref() {
            Some(ParquetPageEncodingStats::Mask(stats)) => Some(stats),
            _ => None,
        }
    }

    /// Returns the offset for the bloom filter.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length in bytes of the bloom filter.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset for the column index.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length in bytes of the column index.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range for the column index, if any
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset for the offset index.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length in bytes of the offset index.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the range for the offset index if any
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of bytes of variable length data after decoding.
    ///
    /// Only set for BYTE_ARRAY columns. This field may not be set by older
    /// writers.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
    /// `vec[0]` indicates how many rows the page contains.
    /// This field may not be set by older writers.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
    /// This field may not be set by older writers.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the encryption metadata for this column chunk.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_deref()
    }

    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }
}
1198
/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
///
/// See also [`ColumnChunkMetaData::into_builder`].
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1218
impl ColumnChunkMetaDataBuilder {
    /// Creates new column chunk metadata builder.
    ///
    /// See also [`ColumnChunkMetaData::builder`]
    fn new(column_descr: ColumnDescPtr) -> Self {
        // Start from an all-default chunk: zero sizes/offsets, no optional fields.
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Default::default(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            geo_statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
            #[cfg(feature = "encryption")]
            encrypted_column_metadata: None,
            #[cfg(feature = "encryption")]
            plaintext_footer_mode: false,
        })
    }

    /// Sets list of encodings for this column chunk.
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = EncodingMask::new_from_encodings(encodings.iter());
        self
    }

    /// Sets the encodings mask for this column chunk.
    pub fn set_encodings_mask(mut self, encodings: EncodingMask) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets optional file path for this column chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets number of values.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets compression.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets data page offset in bytes.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets optional dictionary page offset in bytes.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets optional index page offset in bytes.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets statistics for this column chunk.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Sets geospatial statistics for this column chunk.
    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
        self.0.geo_statistics = Some(value);
        self
    }

    /// Clears the statistics for this column chunk.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets page encoding stats for this column chunk.
    ///
    /// This will overwrite any existing stats, either `Vec` based or bitmask.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(ParquetPageEncodingStats::Full(value));
        self
    }

    /// Sets page encoding stats mask for this column chunk.
    ///
    /// This will overwrite any existing stats, either `Vec` based or bitmask.
    pub fn set_page_encoding_stats_mask(mut self, value: EncodingMask) -> Self {
        self.0.encoding_stats = Some(ParquetPageEncodingStats::Mask(value));
        self
    }

    /// Clears the page encoding stats for this column chunk.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets optional bloom filter offset in bytes.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets optional bloom filter length in bytes.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets optional offset index offset in bytes.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets optional offset index length in bytes.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets optional column index offset in bytes.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets optional column index length in bytes.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets optional length of variable length data in bytes.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets optional repetition level histogram
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets optional definition level histogram
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the encryption metadata for an encrypted column
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value.map(Box::new);
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the encrypted (serialized) column metadata bytes for an encrypted column
    pub fn set_encrypted_column_metadata(mut self, value: Option<Vec<u8>>) -> Self {
        self.0.encrypted_column_metadata = value;
        self
    }

    /// Builds column chunk metadata.
    ///
    /// Note: currently performs no validation and always returns `Ok`.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}
1430
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
    /// Physical type of the column; selects which index variant `build` produces.
    column_type: Type,
    /// For each page, whether the page contains only nulls.
    null_pages: Vec<bool>,
    /// Serialized min value bytes for each page.
    min_values: Vec<Vec<u8>>,
    /// Serialized max value bytes for each page.
    max_values: Vec<Vec<u8>>,
    /// Number of null values in each page.
    null_counts: Vec<i64>,
    /// Ordering of the page min/max values; `UNORDERED` unless set explicitly.
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any entry in the page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-nan values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}
1455
1456impl ColumnIndexBuilder {
1457    /// Creates a new column index builder.
1458    pub fn new(column_type: Type) -> Self {
1459        ColumnIndexBuilder {
1460            column_type,
1461            null_pages: Vec::new(),
1462            min_values: Vec::new(),
1463            max_values: Vec::new(),
1464            null_counts: Vec::new(),
1465            boundary_order: BoundaryOrder::UNORDERED,
1466            repetition_level_histograms: None,
1467            definition_level_histograms: None,
1468            valid: true,
1469        }
1470    }
1471
1472    /// Append statistics for the next page
1473    pub fn append(
1474        &mut self,
1475        null_page: bool,
1476        min_value: Vec<u8>,
1477        max_value: Vec<u8>,
1478        null_count: i64,
1479    ) {
1480        self.null_pages.push(null_page);
1481        self.min_values.push(min_value);
1482        self.max_values.push(max_value);
1483        self.null_counts.push(null_count);
1484    }
1485
1486    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
1487    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
1488    ///
1489    /// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
1490    pub fn append_histograms(
1491        &mut self,
1492        repetition_level_histogram: &Option<LevelHistogram>,
1493        definition_level_histogram: &Option<LevelHistogram>,
1494    ) {
1495        if !self.valid {
1496            return;
1497        }
1498        if let Some(rep_lvl_hist) = repetition_level_histogram {
1499            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1500            hist.reserve(rep_lvl_hist.len());
1501            hist.extend(rep_lvl_hist.values());
1502        }
1503        if let Some(def_lvl_hist) = definition_level_histogram {
1504            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1505            hist.reserve(def_lvl_hist.len());
1506            hist.extend(def_lvl_hist.values());
1507        }
1508    }
1509
1510    /// Set the boundary order of the column index
1511    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1512        self.boundary_order = boundary_order;
1513    }
1514
1515    /// Mark this column index as invalid
1516    pub fn to_invalid(&mut self) {
1517        self.valid = false;
1518    }
1519
1520    /// Is the information in the builder valid?
1521    pub fn valid(&self) -> bool {
1522        self.valid
1523    }
1524
1525    /// Build and get the column index
1526    ///
1527    /// Note: callers should check [`Self::valid`] before calling this method
1528    pub fn build(self) -> Result<ColumnIndexMetaData> {
1529        Ok(match self.column_type {
1530            Type::BOOLEAN => {
1531                let index = self.build_page_index()?;
1532                ColumnIndexMetaData::BOOLEAN(index)
1533            }
1534            Type::INT32 => {
1535                let index = self.build_page_index()?;
1536                ColumnIndexMetaData::INT32(index)
1537            }
1538            Type::INT64 => {
1539                let index = self.build_page_index()?;
1540                ColumnIndexMetaData::INT64(index)
1541            }
1542            Type::INT96 => {
1543                let index = self.build_page_index()?;
1544                ColumnIndexMetaData::INT96(index)
1545            }
1546            Type::FLOAT => {
1547                let index = self.build_page_index()?;
1548                ColumnIndexMetaData::FLOAT(index)
1549            }
1550            Type::DOUBLE => {
1551                let index = self.build_page_index()?;
1552                ColumnIndexMetaData::DOUBLE(index)
1553            }
1554            Type::BYTE_ARRAY => {
1555                let index = self.build_byte_array_index()?;
1556                ColumnIndexMetaData::BYTE_ARRAY(index)
1557            }
1558            Type::FIXED_LEN_BYTE_ARRAY => {
1559                let index = self.build_byte_array_index()?;
1560                ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index)
1561            }
1562        })
1563    }
1564
1565    fn build_page_index<T>(self) -> Result<PrimitiveColumnIndex<T>>
1566    where
1567        T: ParquetValueType,
1568    {
1569        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
1570        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();
1571
1572        PrimitiveColumnIndex::try_new(
1573            self.null_pages,
1574            self.boundary_order,
1575            Some(self.null_counts),
1576            self.repetition_level_histograms,
1577            self.definition_level_histograms,
1578            min_values,
1579            max_values,
1580        )
1581    }
1582
1583    fn build_byte_array_index(self) -> Result<ByteArrayColumnIndex> {
1584        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
1585        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();
1586
1587        ByteArrayColumnIndex::try_new(
1588            self.null_pages,
1589            self.boundary_order,
1590            Some(self.null_counts),
1591            self.repetition_level_histograms,
1592            self.definition_level_histograms,
1593            min_values,
1594            max_values,
1595        )
1596    }
1597}
1598
1599impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1600    fn from(value: ColumnChunkMetaData) -> Self {
1601        ColumnChunkMetaDataBuilder(value)
1602    }
1603}
1604
/// Builder for offset index, part of the Parquet [PageIndex].
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
    /// Byte offset of each page, in append order.
    offset_array: Vec<i64>,
    /// Compressed size of each page, in append order.
    compressed_page_size_array: Vec<i32>,
    /// Index of the first row of each page, in append order.
    first_row_index_array: Vec<i64>,
    /// Per-page unencoded byte-array sizes; `None` until a value is appended.
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running row total so far; the first row index of the next page.
    current_first_row_index: i64,
}
1615
impl Default for OffsetIndexBuilder {
    // Delegates to `new` so `Default` and `new` stay in sync.
    fn default() -> Self {
        Self::new()
    }
}
1621
1622impl OffsetIndexBuilder {
1623    /// Creates a new offset index builder.
1624    pub fn new() -> Self {
1625        OffsetIndexBuilder {
1626            offset_array: Vec::new(),
1627            compressed_page_size_array: Vec::new(),
1628            first_row_index_array: Vec::new(),
1629            unencoded_byte_array_data_bytes_array: None,
1630            current_first_row_index: 0,
1631        }
1632    }
1633
1634    /// Append the row count of the next page.
1635    pub fn append_row_count(&mut self, row_count: i64) {
1636        let current_page_row_index = self.current_first_row_index;
1637        self.first_row_index_array.push(current_page_row_index);
1638        self.current_first_row_index += row_count;
1639    }
1640
1641    /// Append the offset and size of the next page.
1642    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1643        self.offset_array.push(offset);
1644        self.compressed_page_size_array.push(compressed_page_size);
1645    }
1646
1647    /// Append the unencoded byte array data bytes of the next page.
1648    pub fn append_unencoded_byte_array_data_bytes(
1649        &mut self,
1650        unencoded_byte_array_data_bytes: Option<i64>,
1651    ) {
1652        if let Some(val) = unencoded_byte_array_data_bytes {
1653            self.unencoded_byte_array_data_bytes_array
1654                .get_or_insert(Vec::new())
1655                .push(val);
1656        }
1657    }
1658
1659    /// Build and get the thrift metadata of offset index
1660    pub fn build(self) -> OffsetIndexMetaData {
1661        let locations = self
1662            .offset_array
1663            .iter()
1664            .zip(self.compressed_page_size_array.iter())
1665            .zip(self.first_row_index_array.iter())
1666            .map(|((offset, size), row_index)| PageLocation {
1667                offset: *offset,
1668                compressed_page_size: *size,
1669                first_row_index: *row_index,
1670            })
1671            .collect::<Vec<_>>();
1672        OffsetIndexMetaData {
1673            page_locations: locations,
1674            unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes_array,
1675        }
1676    }
1677}
1678
1679#[cfg(test)]
1680mod tests {
1681    use super::*;
1682    use crate::basic::{PageType, SortOrder};
1683    use crate::file::metadata::thrift::tests::{
1684        read_column_chunk, read_column_chunk_with_options, read_row_group,
1685    };
1686
1687    #[test]
1688    fn test_row_group_metadata_thrift_conversion() {
1689        let schema_descr = get_test_schema_descr();
1690
1691        let mut columns = vec![];
1692        for ptr in schema_descr.columns() {
1693            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
1694            columns.push(column);
1695        }
1696        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
1697            .set_num_rows(1000)
1698            .set_total_byte_size(2000)
1699            .set_column_metadata(columns)
1700            .set_ordinal(1)
1701            .build()
1702            .unwrap();
1703
1704        let mut buf = Vec::new();
1705        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1706        row_group_meta.write_thrift(&mut writer).unwrap();
1707
1708        let row_group_res = read_row_group(&mut buf, schema_descr).unwrap();
1709
1710        assert_eq!(row_group_res, row_group_meta);
1711    }
1712
1713    #[test]
1714    fn test_row_group_metadata_thrift_conversion_empty() {
1715        let schema_descr = get_test_schema_descr();
1716
1717        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();
1718
1719        assert!(row_group_meta.is_err());
1720        if let Err(e) = row_group_meta {
1721            assert_eq!(
1722                format!("{e}"),
1723                "Parquet error: Column length mismatch: 2 != 0"
1724            );
1725        }
1726    }
1727
1728    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
1729    #[test]
1730    fn test_row_group_metadata_thrift_corrupted() {
1731        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
1732            SchemaType::group_type_builder("schema")
1733                .with_fields(vec![
1734                    Arc::new(
1735                        SchemaType::primitive_type_builder("a", Type::INT32)
1736                            .build()
1737                            .unwrap(),
1738                    ),
1739                    Arc::new(
1740                        SchemaType::primitive_type_builder("b", Type::INT32)
1741                            .build()
1742                            .unwrap(),
1743                    ),
1744                ])
1745                .build()
1746                .unwrap(),
1747        )));
1748
1749        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
1750            SchemaType::group_type_builder("schema")
1751                .with_fields(vec![
1752                    Arc::new(
1753                        SchemaType::primitive_type_builder("a", Type::INT32)
1754                            .build()
1755                            .unwrap(),
1756                    ),
1757                    Arc::new(
1758                        SchemaType::primitive_type_builder("b", Type::INT32)
1759                            .build()
1760                            .unwrap(),
1761                    ),
1762                    Arc::new(
1763                        SchemaType::primitive_type_builder("c", Type::INT32)
1764                            .build()
1765                            .unwrap(),
1766                    ),
1767                ])
1768                .build()
1769                .unwrap(),
1770        )));
1771
1772        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
1773            .set_num_rows(1000)
1774            .set_total_byte_size(2000)
1775            .set_column_metadata(vec![
1776                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
1777                    .build()
1778                    .unwrap(),
1779                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
1780                    .build()
1781                    .unwrap(),
1782            ])
1783            .set_ordinal(1)
1784            .build()
1785            .unwrap();
1786        let mut buf = Vec::new();
1787        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1788        row_group_meta_2cols.write_thrift(&mut writer).unwrap();
1789
1790        let err = read_row_group(&mut buf, schema_descr_3cols)
1791            .unwrap_err()
1792            .to_string();
1793        assert_eq!(
1794            err,
1795            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
1796        );
1797    }
1798
    /// Round-trips a fully populated column chunk (indexes, bloom filter,
    /// size statistics, level histograms) through the thrift compact protocol.
    ///
    /// The expected value differs from the input in exactly one way: the
    /// input sets the full `page_encoding_stats` list, while the expected
    /// value sets only an encoding-stats *mask* — matching what
    /// `read_column_chunk` produces by default (see the companion
    /// `_full_stats` test, which opts out of the mask via reader options).
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);
        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        // Write then read back with default reader options.
        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        col_metadata.write_thrift(&mut writer).unwrap();
        let col_chunk_res = read_column_chunk(&mut buf, column_descr.clone()).unwrap();

        // Identical to the input above except that the page encoding stats
        // are represented as a mask rather than the full list.
        let expected_metadata = ColumnChunkMetaData::builder(column_descr)
            .set_encodings_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        assert_eq!(col_chunk_res, expected_metadata);
    }
1870
1871    #[test]
1872    fn test_column_chunk_metadata_thrift_conversion_full_stats() {
1873        let column_descr = get_test_schema_descr().column(0);
1874        let stats = vec![
1875            PageEncodingStats {
1876                page_type: PageType::DATA_PAGE,
1877                encoding: Encoding::PLAIN,
1878                count: 3,
1879            },
1880            PageEncodingStats {
1881                page_type: PageType::DATA_PAGE,
1882                encoding: Encoding::RLE,
1883                count: 5,
1884            },
1885        ];
1886        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1887            .set_encodings_mask(EncodingMask::new_from_encodings(
1888                [Encoding::PLAIN, Encoding::RLE].iter(),
1889            ))
1890            .set_num_values(1000)
1891            .set_compression(Compression::SNAPPY)
1892            .set_total_compressed_size(2000)
1893            .set_total_uncompressed_size(3000)
1894            .set_data_page_offset(4000)
1895            .set_page_encoding_stats(stats)
1896            .build()
1897            .unwrap();
1898
1899        let mut buf = Vec::new();
1900        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1901        col_metadata.write_thrift(&mut writer).unwrap();
1902
1903        let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
1904        let col_chunk_res =
1905            read_column_chunk_with_options(&mut buf, column_descr, Some(&options)).unwrap();
1906
1907        assert_eq!(col_chunk_res, col_metadata);
1908    }
1909
1910    #[test]
1911    fn test_column_chunk_metadata_thrift_conversion_empty() {
1912        let column_descr = get_test_schema_descr().column(0);
1913
1914        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1915            .build()
1916            .unwrap();
1917
1918        let mut buf = Vec::new();
1919        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1920        col_metadata.write_thrift(&mut writer).unwrap();
1921        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();
1922
1923        assert_eq!(col_chunk_res, col_metadata);
1924    }
1925
1926    #[test]
1927    fn test_compressed_size() {
1928        let schema_descr = get_test_schema_descr();
1929
1930        let mut columns = vec![];
1931        for column_descr in schema_descr.columns() {
1932            let column = ColumnChunkMetaData::builder(column_descr.clone())
1933                .set_total_compressed_size(500)
1934                .set_total_uncompressed_size(700)
1935                .build()
1936                .unwrap();
1937            columns.push(column);
1938        }
1939        let row_group_meta = RowGroupMetaData::builder(schema_descr)
1940            .set_num_rows(1000)
1941            .set_column_metadata(columns)
1942            .build()
1943            .unwrap();
1944
1945        let compressed_size_res: i64 = row_group_meta.compressed_size();
1946        let compressed_size_exp: i64 = 1000;
1947
1948        assert_eq!(compressed_size_res, compressed_size_exp);
1949    }
1950
    /// Checks `ParquetMetaData::memory_size` against hard-coded expected
    /// values, and that adding more metadata (column/offset indexes) strictly
    /// increases the reported size.
    ///
    /// NOTE: the expected sizes are the exact values produced by the current
    /// struct layouts and must be updated whenever the metadata structures
    /// change; the `encryption` feature adds fields, hence the cfg-gated
    /// constants.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        // Row group whose columns carry empty (all-None) statistics.
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Now, add in Exact Statistics
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2766;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2934;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Single-entry boolean column index to attach below.
        let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN);
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build().unwrap();
        let native_index = match column_index {
            ColumnIndexMetaData::BOOLEAN(index) => index,
            _ => panic!("wrong type of column index"),
        };

        // Now, add in OffsetIndex
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![offset_index]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 3192;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3360;

        // more set fields means more memory usage
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }
2061
    /// Checks that attaching a `FileDecryptor` to [`ParquetMetaData`]
    /// increases the reported `memory_size`, and pins the exact sizes.
    ///
    /// NOTE: the expected sizes are exact values for the current
    /// implementation and must be updated when the metadata or encryption
    /// structures change.
    #[test]
    #[cfg(feature = "encryption")]
    fn test_memory_size_with_decryptor() {
        use crate::encryption::decrypt::FileDecryptionProperties;
        use crate::file::metadata::thrift::encryption::AesGcmV1;

        let schema_descr = get_test_schema_descr();

        // Row group of default (empty) column chunks.
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build())
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        // AES-GCM-V1 algorithm metadata with a caller-supplied AAD prefix.
        let aad_file_unique = vec![1u8; 8];
        let aad_prefix = vec![2u8; 8];
        let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 {
            aad_prefix: Some(aad_prefix.clone()),
            aad_file_unique: Some(aad_file_unique.clone()),
            supply_aad_prefix: Some(true),
        });
        let footer_key_metadata = Some(vec![3u8; 8]);
        let file_metadata =
            FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None)
                .with_encryption_algorithm(Some(encryption_algorithm))
                .with_footer_signing_key_metadata(footer_key_metadata.clone());

        // Baseline: metadata without a file decryptor attached.
        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .build();

        let base_expected_size = 2058;
        assert_eq!(parquet_meta_data.memory_size(), base_expected_size);

        // 16-byte keys for the footer and for every column.
        let footer_key = "0123456789012345".as_bytes();
        let column_key = "1234567890123450".as_bytes();
        let mut decryption_properties_builder =
            FileDecryptionProperties::builder(footer_key.to_vec())
                .with_aad_prefix(aad_prefix.clone());
        for column in schema_descr.columns() {
            decryption_properties_builder = decryption_properties_builder
                .with_column_key(&column.path().string(), column_key.to_vec());
        }
        let decryption_properties = decryption_properties_builder.build().unwrap();
        let decryptor = FileDecryptor::new(
            &decryption_properties,
            footer_key_metadata.as_deref(),
            aad_file_unique,
            aad_prefix,
        )
        .unwrap();

        // Same metadata, now carrying the decryptor: size must grow.
        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .set_file_decryptor(Some(decryptor))
            .build();

        let expected_size_with_decryptor = 3072;
        assert!(expected_size_with_decryptor > base_expected_size);

        assert_eq!(
            parquet_meta_data.memory_size(),
            expected_size_with_decryptor
        );
    }
2136
2137    /// Returns sample schema descriptor so we can create column metadata.
2138    fn get_test_schema_descr() -> SchemaDescPtr {
2139        let schema = SchemaType::group_type_builder("schema")
2140            .with_fields(vec![
2141                Arc::new(
2142                    SchemaType::primitive_type_builder("a", Type::INT32)
2143                        .build()
2144                        .unwrap(),
2145                ),
2146                Arc::new(
2147                    SchemaType::primitive_type_builder("b", Type::INT32)
2148                        .build()
2149                        .unwrap(),
2150                ),
2151            ])
2152            .build()
2153            .unwrap();
2154
2155        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
2156    }
2157}