// parquet/file/metadata/mod.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Parquet metadata API
//!
//! Users should use these structures to interact with Parquet metadata.
//!
//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
//!   file footer.
//!
//! * [`FileMetaData`]: File level metadata such as schema, row counts and
//!   version.
//!
//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
//!   location and number of rows, and column chunks.
//!
//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
//!   within a Row Group including encoding and compression information,
//!   number of values, statistics, etc.
//!
//! # APIs for working with Parquet Metadata
//!
//! The Parquet readers and writers in this crate handle reading and writing
//! metadata into parquet files. To work with metadata directly,
//! the following APIs are available:
//!
//! * [`ParquetMetaDataReader`] for reading metadata from an I/O source (sync and async)
//! * [`ParquetMetaDataPushDecoder`] for decoding from bytes without I/O
//! * [`ParquetMetaDataWriter`] for writing.
//!
//! # Examples
//!
//! Please see [`external_metadata.rs`]
//!
//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
//!
//! # Metadata Encodings and Structures
//!
//! There are three different encodings of Parquet Metadata in this crate:
//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
//!    [parquet.thrift]
//!
//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
//!    from [parquet.thrift]. These structures are low level and mirror
//!    the thrift definitions.
//!
//! 3. [`file::metadata`] (this module): Easier to use Rust structures
//!    with a more idiomatic API. Note that, confusingly, some but not all
//!    of these structures have the same name as the [`format`] structures.
//!
//! [`file::metadata`]: crate::file::metadata
//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
//!
//! Graphically, this is how the different structures relate to each other:
//!
//! ```text
//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
//!                            └──────────────┘     │         └───────────────────────┘ │
//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
//!                                     ...         │                   ...             │
//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
//!
//!                          format::meta structures          file::metadata structures
//!
//!                         * Same name, different struct
//! ```
// Submodules implementing the metadata read/decode/write machinery.
mod footer_tail;
mod memory;
mod options;
mod parser;
mod push_decoder;
pub(crate) mod reader;
pub(crate) mod thrift;
mod writer;

use crate::basic::{
    BoundaryOrder, ColumnOrder, Compression, CompressionCodec, Encoding, EncodingMask, PageType,
    Type,
};
#[cfg(feature = "encryption")]
use crate::encryption::decrypt::FileDecryptor;
use crate::errors::{ParquetError, Result};
#[cfg(feature = "encryption")]
use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
pub(crate) use crate::file::metadata::memory::HeapSize;
#[cfg(feature = "encryption")]
use crate::file::metadata::thrift::encryption::EncryptionAlgorithm;
use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex};
use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation};
use crate::file::statistics::Statistics;
use crate::geospatial::statistics as geo_statistics;
use crate::parquet_thrift::{
    ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
    WriteThrift, WriteThriftField,
};
use crate::schema::types::{
    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
    Type as SchemaType,
};
use crate::thrift_struct;
use crate::{
    data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
};

// Re-exports forming the public metadata API of this module.
pub use footer_tail::FooterTail;
pub use options::{ParquetMetaDataOptions, ParquetStatisticsPolicy};
pub use push_decoder::ParquetMetaDataPushDecoder;
pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
use std::io::Write;
use std::ops::Range;
use std::sync::Arc;
pub use writer::ParquetMetaDataWriter;
pub(crate) use writer::ThriftMetadataWriter;

/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`ColumnIndex`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`ColumnIndex`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example `column_index[2][3]` holds the [`ColumnIndex`] for the fourth
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;

/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number`.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;

/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata
    file_metadata: FileMetaData,
    /// Metadata for each row group
    row_groups: Vec<RowGroupMetaData>,
    /// Page level index for each page in each column chunk, if loaded
    column_index: Option<ParquetColumnIndex>,
    /// Offset index for each page in each column chunk, if loaded
    offset_index: Option<ParquetOffsetIndex>,
    /// Optional decryptor used to read encrypted data
    #[cfg(feature = "encryption")]
    file_decryptor: Option<Box<FileDecryptor>>,
}

200impl ParquetMetaData {
201    /// Creates Parquet metadata from file metadata and a list of row
202    /// group metadata
203    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
204        ParquetMetaData {
205            file_metadata,
206            row_groups,
207            column_index: None,
208            offset_index: None,
209            #[cfg(feature = "encryption")]
210            file_decryptor: None,
211        }
212    }
213
214    /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of
215    /// encrypted data.
216    #[cfg(feature = "encryption")]
217    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
218        self.file_decryptor = file_decryptor.map(Box::new);
219    }
220
221    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
222    pub fn into_builder(self) -> ParquetMetaDataBuilder {
223        self.into()
224    }
225
226    /// Returns file metadata as reference.
227    pub fn file_metadata(&self) -> &FileMetaData {
228        &self.file_metadata
229    }
230
231    /// Returns file decryptor as reference.
232    #[cfg(feature = "encryption")]
233    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
234        self.file_decryptor.as_deref()
235    }
236
237    /// Returns number of row groups in this file.
238    pub fn num_row_groups(&self) -> usize {
239        self.row_groups.len()
240    }
241
242    /// Returns row group metadata for `i`th position.
243    /// Position should be less than number of row groups `num_row_groups`.
244    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
245        &self.row_groups[i]
246    }
247
248    /// Returns slice of row groups in this file.
249    pub fn row_groups(&self) -> &[RowGroupMetaData] {
250        &self.row_groups
251    }
252
253    /// Returns the column index for this file if loaded
254    ///
255    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
256    /// [ArrowReaderOptions::with_page_index] was set to false.
257    ///
258    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
259    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
260        self.column_index.as_ref()
261    }
262
263    /// Returns offset indexes in this file, if loaded
264    ///
265    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
266    /// [ArrowReaderOptions::with_page_index] was set to false.
267    ///
268    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
269    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
270        self.offset_index.as_ref()
271    }
272
273    /// Estimate of the bytes allocated to store `ParquetMetadata`
274    ///
275    /// # Notes:
276    ///
277    /// 1. Includes size of self
278    ///
279    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
280    ///    [`RowGroupMetaData`].
281    ///
282    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
283    ///    means `memory_size` will over estimate the memory size if such pointers
284    ///    are shared.
285    ///
286    /// 4. Does not include any allocator overheads
287    pub fn memory_size(&self) -> usize {
288        #[cfg(feature = "encryption")]
289        let encryption_size = self.file_decryptor.heap_size();
290        #[cfg(not(feature = "encryption"))]
291        let encryption_size = 0usize;
292
293        std::mem::size_of::<Self>()
294            + self.file_metadata.heap_size()
295            + self.row_groups.heap_size()
296            + self.column_index.heap_size()
297            + self.offset_index.heap_size()
298            + encryption_size
299    }
300
301    /// Override the column index
302    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
303        self.column_index = index;
304    }
305
306    /// Override the offset index
307    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
308        self.offset_index = index;
309    }
310}
311
/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// # Example creating a new [`ParquetMetaData`]
///
///```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
pub struct ParquetMetaDataBuilder(ParquetMetaData);

351impl ParquetMetaDataBuilder {
352    /// Create a new builder from a file metadata, with no row groups
353    pub fn new(file_meta_data: FileMetaData) -> Self {
354        Self(ParquetMetaData::new(file_meta_data, vec![]))
355    }
356
357    /// Create a new builder from an existing ParquetMetaData
358    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
359        Self(metadata)
360    }
361
362    /// Adds a row group to the metadata
363    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
364        self.0.row_groups.push(row_group);
365        self
366    }
367
368    /// Sets all the row groups to the specified list
369    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
370        self.0.row_groups = row_groups;
371        self
372    }
373
374    /// Takes ownership of the row groups in this builder, and clears the list
375    /// of row groups.
376    ///
377    /// This can be used for more efficient creation of a new ParquetMetaData
378    /// from an existing one.
379    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
380        std::mem::take(&mut self.0.row_groups)
381    }
382
383    /// Return a reference to the current row groups
384    pub fn row_groups(&self) -> &[RowGroupMetaData] {
385        &self.0.row_groups
386    }
387
388    /// Sets the column index
389    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
390        self.0.column_index = column_index;
391        self
392    }
393
394    /// Returns the current column index from the builder, replacing it with `None`
395    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
396        std::mem::take(&mut self.0.column_index)
397    }
398
399    /// Return a reference to the current column index, if any
400    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
401        self.0.column_index.as_ref()
402    }
403
404    /// Sets the offset index
405    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
406        self.0.offset_index = offset_index;
407        self
408    }
409
410    /// Returns the current offset index from the builder, replacing it with `None`
411    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
412        std::mem::take(&mut self.0.offset_index)
413    }
414
415    /// Return a reference to the current offset index, if any
416    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
417        self.0.offset_index.as_ref()
418    }
419
420    /// Sets the file decryptor needed to decrypt this metadata.
421    #[cfg(feature = "encryption")]
422    pub(crate) fn set_file_decryptor(mut self, file_decryptor: Option<FileDecryptor>) -> Self {
423        self.0.with_file_decryptor(file_decryptor);
424        self
425    }
426
427    /// Creates a new ParquetMetaData from the builder
428    pub fn build(self) -> ParquetMetaData {
429        let Self(metadata) = self;
430        metadata
431    }
432}
433
434impl From<ParquetMetaData> for ParquetMetaDataBuilder {
435    fn from(meta_data: ParquetMetaData) -> Self {
436        Self(meta_data)
437    }
438}
439
thrift_struct!(
/// A key-value pair for [`FileMetaData`].
///
/// `key` is required and `value` is optional, mirroring the `KeyValue`
/// struct in the Parquet thrift definition.
pub struct KeyValue {
  1: required string key
  2: optional string value
}
);

448impl KeyValue {
449    /// Create a new key value pair
450    pub fn new<F2>(key: String, value: F2) -> KeyValue
451    where
452        F2: Into<Option<String>>,
453    {
454        KeyValue {
455            key,
456            value: value.into(),
457        }
458    }
459}
460
thrift_struct!(
/// PageEncodingStats for a column chunk and data page.
///
/// Records how many pages of a given [`PageType`] were written with a given
/// [`Encoding`].
pub struct PageEncodingStats {
  1: required PageType page_type;
  2: required Encoding encoding;
  3: required i32 count;
}
);

/// Internal representation of the page encoding stats in the [`ColumnChunkMetaData`].
/// This is not publicly exposed, with different getters defined for each variant.
#[derive(Debug, Clone, PartialEq)]
enum ParquetPageEncodingStats {
    /// The full array of stats as defined in the Parquet spec.
    Full(Vec<PageEncodingStats>),
    /// A condensed version recording only which page encodings were seen.
    Mask(EncodingMask),
}

/// Reference counted pointer ([`Arc`]) for [`FileMetaData`]; cloning is cheap.
pub type FileMetaDataPtr = Arc<FileMetaData>;

/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Format version of this file.
    version: i32,
    /// Number of rows in the file.
    num_rows: i64,
    /// Application that wrote the file, e.g. `parquet-mr version 1.8.0 (...)`.
    created_by: Option<String>,
    /// Optional application-defined key/value metadata.
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Descriptor of the file's schema.
    schema_descr: SchemaDescPtr,
    /// Sort order used for `min`/`max` statistics, one entry per column
    /// (see [`Self::column_orders`]).
    column_orders: Option<Vec<ColumnOrder>>,
    /// Encryption algorithm used, when the file is encrypted.
    #[cfg(feature = "encryption")]
    encryption_algorithm: Option<Box<EncryptionAlgorithm>>,
    /// Metadata of the key used to sign the footer
    /// (per the Parquet `FileMetaData.footer_signing_key_metadata` thrift field).
    #[cfg(feature = "encryption")]
    footer_signing_key_metadata: Option<Vec<u8>>,
}

500impl FileMetaData {
501    /// Creates new file metadata.
502    pub fn new(
503        version: i32,
504        num_rows: i64,
505        created_by: Option<String>,
506        key_value_metadata: Option<Vec<KeyValue>>,
507        schema_descr: SchemaDescPtr,
508        column_orders: Option<Vec<ColumnOrder>>,
509    ) -> Self {
510        FileMetaData {
511            version,
512            num_rows,
513            created_by,
514            key_value_metadata,
515            schema_descr,
516            column_orders,
517            #[cfg(feature = "encryption")]
518            encryption_algorithm: None,
519            #[cfg(feature = "encryption")]
520            footer_signing_key_metadata: None,
521        }
522    }
523
524    #[cfg(feature = "encryption")]
525    pub(crate) fn with_encryption_algorithm(
526        mut self,
527        encryption_algorithm: Option<EncryptionAlgorithm>,
528    ) -> Self {
529        self.encryption_algorithm = encryption_algorithm.map(Box::new);
530        self
531    }
532
533    #[cfg(feature = "encryption")]
534    pub(crate) fn with_footer_signing_key_metadata(
535        mut self,
536        footer_signing_key_metadata: Option<Vec<u8>>,
537    ) -> Self {
538        self.footer_signing_key_metadata = footer_signing_key_metadata;
539        self
540    }
541
542    /// Returns version of this file.
543    pub fn version(&self) -> i32 {
544        self.version
545    }
546
547    /// Returns number of rows in the file.
548    pub fn num_rows(&self) -> i64 {
549        self.num_rows
550    }
551
552    /// String message for application that wrote this file.
553    ///
554    /// This should have the following format:
555    /// `<application> version <application version> (build <application build hash>)`.
556    ///
557    /// ```shell
558    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
559    /// ```
560    pub fn created_by(&self) -> Option<&str> {
561        self.created_by.as_deref()
562    }
563
564    /// Returns key_value_metadata of this file.
565    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
566        self.key_value_metadata.as_ref()
567    }
568
569    /// Returns Parquet [`Type`] that describes schema in this file.
570    ///
571    /// [`Type`]: crate::schema::types::Type
572    pub fn schema(&self) -> &SchemaType {
573        self.schema_descr.root_schema()
574    }
575
576    /// Returns a reference to schema descriptor.
577    pub fn schema_descr(&self) -> &SchemaDescriptor {
578        &self.schema_descr
579    }
580
581    /// Returns reference counted clone for schema descriptor.
582    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
583        self.schema_descr.clone()
584    }
585
586    /// Column (sort) order used for `min` and `max` values of each column in this file.
587    ///
588    /// Each column order corresponds to one column, determined by its position in the
589    /// list, matching the position of the column in the schema.
590    ///
591    /// When `None` is returned, there are no column orders available, and each column
592    /// should be assumed to have undefined (legacy) column order.
593    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
594        self.column_orders.as_ref()
595    }
596
597    /// Returns column order for `i`th column in this file.
598    /// If column orders are not available, returns undefined (legacy) column order.
599    pub fn column_order(&self, i: usize) -> ColumnOrder {
600        self.column_orders
601            .as_ref()
602            .map(|data| data[i])
603            .unwrap_or(ColumnOrder::UNDEFINED)
604    }
605}
606
thrift_struct!(
/// Sort order within a RowGroup of a leaf column
pub struct SortingColumn {
  /// The ordinal position of the column (in this row group)
  1: required i32 column_idx

  /// If true, indicates this column is sorted in descending order.
  2: required bool descending

  /// If true, nulls will come before non-null values, otherwise,
  /// nulls go at the end.
  3: required bool nulls_first
}
);

/// Reference counted pointer ([`Arc`]) for [`RowGroupMetaData`]; cloning is cheap.
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows,
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk in this row group.
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group.
    num_rows: i64,
    /// Sort order of the rows, if any.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total uncompressed byte size of all column data.
    total_byte_size: i64,
    /// Schema descriptor shared with the file.
    schema_descr: SchemaDescPtr,
    /// We can't infer this from the file offset of the first column since
    /// there may be empty columns in the row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
}

642impl RowGroupMetaData {
643    /// Returns builder for row group metadata.
644    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
645        RowGroupMetaDataBuilder::new(schema_descr)
646    }
647
648    /// Number of columns in this row group.
649    pub fn num_columns(&self) -> usize {
650        self.columns.len()
651    }
652
653    /// Returns column chunk metadata for `i`th column.
654    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
655        &self.columns[i]
656    }
657
658    /// Returns slice of column chunk metadata.
659    pub fn columns(&self) -> &[ColumnChunkMetaData] {
660        &self.columns
661    }
662
663    /// Returns mutable slice of column chunk metadata.
664    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
665        &mut self.columns
666    }
667
668    /// Number of rows in this row group.
669    pub fn num_rows(&self) -> i64 {
670        self.num_rows
671    }
672
673    /// Returns the sort ordering of the rows in this RowGroup if any
674    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
675        self.sorting_columns.as_ref()
676    }
677
678    /// Total byte size of all uncompressed column data in this row group.
679    pub fn total_byte_size(&self) -> i64 {
680        self.total_byte_size
681    }
682
683    /// Total size of all compressed column data in this row group.
684    pub fn compressed_size(&self) -> i64 {
685        self.columns.iter().map(|c| c.total_compressed_size).sum()
686    }
687
688    /// Returns reference to a schema descriptor.
689    pub fn schema_descr(&self) -> &SchemaDescriptor {
690        self.schema_descr.as_ref()
691    }
692
693    /// Returns reference counted clone of schema descriptor.
694    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
695        self.schema_descr.clone()
696    }
697
698    /// Returns ordinal position of this row group in file.
699    ///
700    /// For example if this is the first row group in the file, this will return 0.
701    /// If this is the second row group in the file, this will return 1.
702    #[inline(always)]
703    pub fn ordinal(&self) -> Option<i16> {
704        self.ordinal
705    }
706
707    /// Returns file offset of this row group in file.
708    #[inline(always)]
709    pub fn file_offset(&self) -> Option<i64> {
710        self.file_offset
711    }
712
713    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
714    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
715        RowGroupMetaDataBuilder(self)
716    }
717}
718
/// Builder for row group metadata.
///
/// Create one with [`RowGroupMetaData::builder`] or [`RowGroupMetaData::into_builder`].
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);

722impl RowGroupMetaDataBuilder {
723    /// Creates new builder from schema descriptor.
724    fn new(schema_descr: SchemaDescPtr) -> Self {
725        Self(RowGroupMetaData {
726            columns: Vec::with_capacity(schema_descr.num_columns()),
727            schema_descr,
728            file_offset: None,
729            num_rows: 0,
730            sorting_columns: None,
731            total_byte_size: 0,
732            ordinal: None,
733        })
734    }
735
736    /// Sets number of rows in this row group.
737    pub fn set_num_rows(mut self, value: i64) -> Self {
738        self.0.num_rows = value;
739        self
740    }
741
742    /// Sets the sorting order for columns
743    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
744        self.0.sorting_columns = value;
745        self
746    }
747
748    /// Sets total size in bytes for this row group.
749    pub fn set_total_byte_size(mut self, value: i64) -> Self {
750        self.0.total_byte_size = value;
751        self
752    }
753
754    /// Takes ownership of the the column metadata in this builder, and clears
755    /// the list of columns.
756    ///
757    /// This can be used for more efficient creation of a new RowGroupMetaData
758    /// from an existing one.
759    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
760        std::mem::take(&mut self.0.columns)
761    }
762
763    /// Sets column metadata for this row group.
764    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
765        self.0.columns = value;
766        self
767    }
768
769    /// Adds a column metadata to this row group
770    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
771        self.0.columns.push(value);
772        self
773    }
774
775    /// Sets ordinal for this row group.
776    pub fn set_ordinal(mut self, value: i16) -> Self {
777        self.0.ordinal = Some(value);
778        self
779    }
780
781    /// Sets file offset for this row group.
782    pub fn set_file_offset(mut self, value: i64) -> Self {
783        self.0.file_offset = Some(value);
784        self
785    }
786
787    /// Builds row group metadata.
788    pub fn build(self) -> Result<RowGroupMetaData> {
789        if self.0.schema_descr.num_columns() != self.0.columns.len() {
790            return Err(general_err!(
791                "Column length mismatch: {} != {}",
792                self.0.schema_descr.num_columns(),
793                self.0.columns.len()
794            ));
795        }
796
797        Ok(self.0)
798    }
799
800    /// Build row group metadata without validation.
801    pub(super) fn build_unchecked(self) -> RowGroupMetaData {
802        self.0
803    }
804}
805
/// Metadata for a column chunk.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor of this leaf column.
    column_descr: ColumnDescPtr,
    /// Mask of all encodings used by pages of this chunk.
    encodings: EncodingMask,
    /// File containing the chunk when stored outside this file
    /// (per the Parquet `ColumnChunk.file_path` thrift field).
    file_path: Option<String>,
    file_offset: i64,
    /// Number of values in this chunk
    /// (per the Parquet `ColumnMetaData.num_values` thrift field).
    num_values: i64,
    compression: CompressionCodec,
    /// Compressed size of this chunk, in bytes.
    total_compressed_size: i64,
    /// Uncompressed size of this chunk, in bytes.
    total_uncompressed_size: i64,
    /// Byte offset of the first data page.
    data_page_offset: i64,
    index_page_offset: Option<i64>,
    /// Byte offset of the dictionary page, if present.
    dictionary_page_offset: Option<i64>,
    /// Chunk level statistics, if written.
    statistics: Option<Statistics>,
    /// Geospatial statistics, if written (boxed to keep the struct small).
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    /// Per-page encoding statistics, either the full list or a condensed mask.
    encoding_stats: Option<ParquetPageEncodingStats>,
    /// Location and length of this chunk's bloom filter, if any.
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    /// Location and length of this chunk's serialized `OffsetIndex`, if any.
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    /// Location and length of this chunk's serialized `ColumnIndex`, if any.
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    /// Size of unencoded byte array data
    /// (per the Parquet `SizeStatistics.unencoded_byte_array_data_bytes` thrift field).
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Histogram of repetition levels, if written (see [`LevelHistogram`]).
    repetition_level_histogram: Option<LevelHistogram>,
    /// Histogram of definition levels, if written (see [`LevelHistogram`]).
    definition_level_histogram: Option<LevelHistogram>,
    /// How this column is encrypted, if it is.
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
    /// Encrypted column metadata bytes, when present.
    #[cfg(feature = "encryption")]
    encrypted_column_metadata: Option<Vec<u8>>,
    /// When true, indicates the footer is plaintext (not encrypted).
    /// This affects how column metadata is serialized when `encrypted_column_metadata` is present.
    /// This field is only used at write time and is not needed when reading metadata.
    #[cfg(feature = "encryption")]
    plaintext_footer_mode: bool,
}

/// Histograms for repetition and definition levels.
///
/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of
/// values at level `i`.
///
/// For example, `vec[0]` is the number of values with level 0, `vec[1]` is the
/// number of values with level 1, and so on.
///
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a new level histogram data.
    ///
    /// Length will be `max_level + 1`.
    ///
    /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case)
    pub fn try_new(max_level: i16) -> Option<Self> {
        if max_level > 0 {
            Some(Self {
                inner: vec![0; max_level as usize + 1],
            })
        } else {
            None
        }
    }

    /// Returns a reference to the histogram's values.
    pub fn values(&self) -> &[i64] {
        &self.inner
    }

    /// Return the inner vector, consuming self
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the histogram value at the given index.
    ///
    /// The value of `i` is the number of values with level `i`. For example,
    /// `get(1)` returns the number of values with level 1.
    ///
    /// Returns `None` if the index is out of bounds.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Adds the values from the other histogram to this histogram
    ///
    /// # Panics
    /// If the histograms have different lengths
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
            *dst += src;
        }
    }

    /// return the length of the histogram
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// returns if the histogram is empty
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Sets the values of all histogram levels to 0.
    pub fn reset(&mut self) {
        // `slice::fill` is the idiomatic (and at least as fast) form of the
        // manual zeroing loop.
        self.inner.fill(0);
    }

    /// Increments the count for a level value by `count`.
    ///
    /// # Panics
    /// If `level` is out of bounds for this histogram, i.e. greater than the
    /// `max_level` supplied to [`Self::try_new`].
    #[inline]
    pub fn increment_by(&mut self, level: i16, count: i64) {
        self.inner[level as usize] += count;
    }

    /// Updates histogram values using provided repetition levels
    ///
    /// # Panics
    /// if any of the levels is greater than the length of the histogram (
    /// the argument supplied to [`Self::try_new`])
    #[deprecated(since = "58.2.0", note = "Use `increment_by` instead")]
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &level in levels {
            self.increment_by(level, 1);
        }
    }
}

impl From<Vec<i64>> for LevelHistogram {
    fn from(inner: Vec<i64>) -> Self {
        Self { inner }
    }
}

impl From<LevelHistogram> for Vec<i64> {
    fn from(value: LevelHistogram) -> Self {
        value.into_inner()
    }
}
949
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        // The only heap allocation is the inner vector.
        self.inner.heap_size()
    }
}
955
/// Represents common operations for a column chunk.
impl ColumnChunkMetaData {
    /// Returns builder for column chunk metadata.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// File where the column chunk is stored.
    ///
    /// If not set, assumed to belong to the same file as the metadata.
    /// This path is relative to the current file.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Byte offset of `ColumnMetaData` in `file_path()`.
    ///
    /// Note that the meaning of this field has been inconsistent between implementations
    /// so its use has since been deprecated in the Parquet specification. Modern implementations
    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
    /// `ColumnChunk` struct.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Type of this column. Must be primitive.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Path (or identifier) of this column.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Descriptor for this column.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Reference counted clone of descriptor for this column.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// All encodings used for this column.
    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
        self.encodings.encodings()
    }

    /// All encodings used for this column, returned as a bitmask.
    pub fn encodings_mask(&self) -> &EncodingMask {
        &self.encodings
    }

    /// Total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// [`Compression`] for this column.
    ///
    /// This is a default value suitable for passing to [`WriterPropertiesBuilder::set_compression`].
    /// It is constructed from the `codec` field of the Parquet `ColumnMetaData`
    ///
    /// [`WriterPropertiesBuilder::set_compression`]: crate::file::properties::WriterPropertiesBuilder
    pub fn compression(&self) -> Compression {
        self.compression.into()
    }

    /// Returns the compression codec used when writing this column.
    pub fn compression_codec(&self) -> CompressionCodec {
        self.compression
    }

    /// Returns the total compressed data size of this column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed data size of this column chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset for the column data.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset for the index page.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset for the dictionary page, if any.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the offset and length in bytes of the column chunk within the file
    pub fn byte_range(&self) -> (u64, u64) {
        // The chunk starts at the dictionary page when one is present,
        // otherwise at the first data page.
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns statistics that are set for this column chunk,
    /// or `None` if no statistics are available.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns geospatial statistics that are set for this column chunk,
    /// or `None` if no geospatial statistics are available.
    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
        self.geo_statistics.as_deref()
    }

    /// Returns the page encoding statistics, or `None` if no page encoding statistics
    /// are available (or they were converted to a mask).
    ///
    /// Note: By default, this crate converts page encoding statistics to a mask for performance
    /// reasons. To get the full statistics, you must set [`ParquetMetaDataOptions::with_encoding_stats_as_mask`]
    /// to `false`.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        match self.encoding_stats.as_ref() {
            Some(ParquetPageEncodingStats::Full(stats)) => Some(stats),
            _ => None,
        }
    }

    /// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are
    /// not available (or they were left in their original form).
    ///
    /// Note: This is the default behavior for this crate.
    ///
    /// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to
    /// enable fast determination of whether all pages in a column chunk are dictionary encoded
    /// (see <https://github.com/apache/parquet-format/pull/16>).
    /// Decoding the full page encoding statistics, however, can be very costly, and is not
    /// necessary to support the aforementioned use case. As an alternative, this crate can
    /// instead distill the list of `PageEncodingStats` down to a bitmask of just the encodings
    /// used for data pages
    /// (see [`ParquetMetaDataOptions::set_encoding_stats_as_mask`]).
    /// To test for an all-dictionary-encoded chunk one could use this bitmask in the following way:
    ///
    /// ```rust
    /// use parquet::basic::Encoding;
    /// use parquet::file::metadata::ColumnChunkMetaData;
    /// // test if all data pages in the column chunk are dictionary encoded
    /// fn is_all_dictionary_encoded(col_meta: &ColumnChunkMetaData) -> bool {
    ///     // check that dictionary encoding was used
    ///     col_meta.dictionary_page_offset().is_some()
    ///         && col_meta.page_encoding_stats_mask().is_some_and(|mask| {
    ///             // mask should only have one bit set, either for PLAIN_DICTIONARY or
    ///             // RLE_DICTIONARY
    ///             mask.is_only(Encoding::PLAIN_DICTIONARY) || mask.is_only(Encoding::RLE_DICTIONARY)
    ///         })
    /// }
    /// ```
    pub fn page_encoding_stats_mask(&self) -> Option<&EncodingMask> {
        match self.encoding_stats.as_ref() {
            Some(ParquetPageEncodingStats::Mask(stats)) => Some(stats),
            _ => None,
        }
    }

    /// Returns the offset for the bloom filter.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length of the bloom filter in bytes, if set.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset for the column index.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length of the column index in bytes.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range for the column index if any
    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.column_index_offset?).ok()?;
        let length = u64::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset for the offset index.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length of the offset index in bytes.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range for the offset index if any
    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
        let offset = u64::try_from(self.offset_index_offset?).ok()?;
        let length = u64::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of bytes of variable length data after decoding.
    ///
    /// Only set for BYTE_ARRAY columns. This field may not be set by older
    /// writers.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
    /// `vec[0]` indicates how many rows the page contains.
    /// This field may not be set by older writers.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
    /// This field may not be set by older writers.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Returns the encryption metadata for this column chunk.
    #[cfg(feature = "encryption")]
    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
        self.column_crypto_metadata.as_deref()
    }

    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }
}
1212
/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one.
///
/// See [`ColumnChunkMetaData::builder`] for the usual entry point.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1232
impl ColumnChunkMetaDataBuilder {
    /// Creates new column chunk metadata builder.
    ///
    /// See also [`ColumnChunkMetaData::builder`]
    fn new(column_descr: ColumnDescPtr) -> Self {
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Default::default(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: CompressionCodec::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            geo_statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
            #[cfg(feature = "encryption")]
            column_crypto_metadata: None,
            #[cfg(feature = "encryption")]
            encrypted_column_metadata: None,
            #[cfg(feature = "encryption")]
            plaintext_footer_mode: false,
        })
    }

    /// Sets list of encodings for this column chunk.
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = EncodingMask::new_from_encodings(encodings.iter());
        self
    }

    /// Sets the encodings mask for this column chunk.
    pub fn set_encodings_mask(mut self, encodings: EncodingMask) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets optional file path for this column chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets number of values.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets compression codec given a [`Compression`] configuration value.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value.into();
        self
    }

    /// Sets compression codec.
    pub fn set_compression_codec(mut self, value: CompressionCodec) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets data page offset in bytes.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets optional dictionary page offset in bytes.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets optional index page offset in bytes.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets statistics for this column chunk.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Sets geospatial statistics for this column chunk.
    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
        self.0.geo_statistics = Some(value);
        self
    }

    /// Clears the statistics for this column chunk.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets page encoding stats for this column chunk.
    ///
    /// This will overwrite any existing stats, either `Vec` based or bitmask.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(ParquetPageEncodingStats::Full(value));
        self
    }

    /// Sets page encoding stats mask for this column chunk.
    ///
    /// This will overwrite any existing stats, either `Vec` based or bitmask.
    pub fn set_page_encoding_stats_mask(mut self, value: EncodingMask) -> Self {
        self.0.encoding_stats = Some(ParquetPageEncodingStats::Mask(value));
        self
    }

    /// Clears the page encoding stats for this column chunk.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets optional bloom filter offset in bytes.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets optional bloom filter length in bytes.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets optional offset index offset in bytes.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets optional offset index length in bytes.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets optional column index offset in bytes.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets optional column index length in bytes.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets optional length of variable length data in bytes.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets optional repetition level histogram
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets optional definition level histogram
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the encryption metadata for an encrypted column
    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
        self.0.column_crypto_metadata = value.map(Box::new);
        self
    }

    #[cfg(feature = "encryption")]
    /// Set the encrypted column metadata bytes for an encrypted column
    pub fn set_encrypted_column_metadata(mut self, value: Option<Vec<u8>>) -> Self {
        self.0.encrypted_column_metadata = value;
        self
    }

    /// Builds column chunk metadata.
    ///
    /// Note: currently infallible; the `Result` return type is kept for API
    /// stability.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}
1450
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
    /// Physical type of the column; selects the index variant built in [`Self::build`].
    column_type: Type,
    /// Per-page flag: `true` if the page contains only nulls.
    null_pages: Vec<bool>,
    /// Per-page minimum values (raw bytes).
    min_values: Vec<Vec<u8>>,
    /// Per-page maximum values (raw bytes).
    max_values: Vec<Vec<u8>>,
    /// Per-page null counts.
    null_counts: Vec<i64>,
    /// Ordering of the min/max values across pages.
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any entry in the page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-nan values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}
1475
1476impl ColumnIndexBuilder {
1477    /// Creates a new column index builder.
1478    pub fn new(column_type: Type) -> Self {
1479        ColumnIndexBuilder {
1480            column_type,
1481            null_pages: Vec::new(),
1482            min_values: Vec::new(),
1483            max_values: Vec::new(),
1484            null_counts: Vec::new(),
1485            boundary_order: BoundaryOrder::UNORDERED,
1486            repetition_level_histograms: None,
1487            definition_level_histograms: None,
1488            valid: true,
1489        }
1490    }
1491
1492    /// Append statistics for the next page
1493    pub fn append(
1494        &mut self,
1495        null_page: bool,
1496        min_value: Vec<u8>,
1497        max_value: Vec<u8>,
1498        null_count: i64,
1499    ) {
1500        self.null_pages.push(null_page);
1501        self.min_values.push(min_value);
1502        self.max_values.push(max_value);
1503        self.null_counts.push(null_count);
1504    }
1505
1506    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
1507    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
1508    ///
1509    /// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
1510    pub fn append_histograms(
1511        &mut self,
1512        repetition_level_histogram: &Option<LevelHistogram>,
1513        definition_level_histogram: &Option<LevelHistogram>,
1514    ) {
1515        if !self.valid {
1516            return;
1517        }
1518        if let Some(rep_lvl_hist) = repetition_level_histogram {
1519            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1520            hist.reserve(rep_lvl_hist.len());
1521            hist.extend(rep_lvl_hist.values());
1522        }
1523        if let Some(def_lvl_hist) = definition_level_histogram {
1524            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1525            hist.reserve(def_lvl_hist.len());
1526            hist.extend(def_lvl_hist.values());
1527        }
1528    }
1529
1530    /// Set the boundary order of the column index
1531    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1532        self.boundary_order = boundary_order;
1533    }
1534
1535    /// Mark this column index as invalid
1536    pub fn to_invalid(&mut self) {
1537        self.valid = false;
1538    }
1539
1540    /// Is the information in the builder valid?
1541    pub fn valid(&self) -> bool {
1542        self.valid
1543    }
1544
1545    /// Build and get the column index
1546    ///
1547    /// Note: callers should check [`Self::valid`] before calling this method
1548    pub fn build(self) -> Result<ColumnIndexMetaData> {
1549        Ok(match self.column_type {
1550            Type::BOOLEAN => {
1551                let index = self.build_page_index()?;
1552                ColumnIndexMetaData::BOOLEAN(index)
1553            }
1554            Type::INT32 => {
1555                let index = self.build_page_index()?;
1556                ColumnIndexMetaData::INT32(index)
1557            }
1558            Type::INT64 => {
1559                let index = self.build_page_index()?;
1560                ColumnIndexMetaData::INT64(index)
1561            }
1562            Type::INT96 => {
1563                let index = self.build_page_index()?;
1564                ColumnIndexMetaData::INT96(index)
1565            }
1566            Type::FLOAT => {
1567                let index = self.build_page_index()?;
1568                ColumnIndexMetaData::FLOAT(index)
1569            }
1570            Type::DOUBLE => {
1571                let index = self.build_page_index()?;
1572                ColumnIndexMetaData::DOUBLE(index)
1573            }
1574            Type::BYTE_ARRAY => {
1575                let index = self.build_byte_array_index()?;
1576                ColumnIndexMetaData::BYTE_ARRAY(index)
1577            }
1578            Type::FIXED_LEN_BYTE_ARRAY => {
1579                let index = self.build_byte_array_index()?;
1580                ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index)
1581            }
1582        })
1583    }
1584
1585    fn build_page_index<T>(self) -> Result<PrimitiveColumnIndex<T>>
1586    where
1587        T: ParquetValueType,
1588    {
1589        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
1590        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();
1591
1592        PrimitiveColumnIndex::try_new(
1593            self.null_pages,
1594            self.boundary_order,
1595            Some(self.null_counts),
1596            self.repetition_level_histograms,
1597            self.definition_level_histograms,
1598            min_values,
1599            max_values,
1600        )
1601    }
1602
1603    fn build_byte_array_index(self) -> Result<ByteArrayColumnIndex> {
1604        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
1605        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();
1606
1607        ByteArrayColumnIndex::try_new(
1608            self.null_pages,
1609            self.boundary_order,
1610            Some(self.null_counts),
1611            self.repetition_level_histograms,
1612            self.definition_level_histograms,
1613            min_values,
1614            max_values,
1615        )
1616    }
1617}
1618
impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
    /// Creates a builder seeded with an existing [`ColumnChunkMetaData`].
    fn from(value: ColumnChunkMetaData) -> Self {
        ColumnChunkMetaDataBuilder(value)
    }
}
1624
/// Builder for offset index, part of the Parquet [PageIndex].
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
    /// Byte offset of each appended page.
    offset_array: Vec<i64>,
    /// Compressed size of each appended page.
    compressed_page_size_array: Vec<i32>,
    /// Index of the first row of each appended page.
    first_row_index_array: Vec<i64>,
    /// Optional per-page unencoded byte array sizes; `None` until first set.
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// Running row count; becomes the first row index of the next page.
    current_first_row_index: i64,
}
1635
impl Default for OffsetIndexBuilder {
    fn default() -> Self {
        // Delegate to `new` so both construction paths stay in sync.
        Self::new()
    }
}
1641
1642impl OffsetIndexBuilder {
1643    /// Creates a new offset index builder.
1644    pub fn new() -> Self {
1645        OffsetIndexBuilder {
1646            offset_array: Vec::new(),
1647            compressed_page_size_array: Vec::new(),
1648            first_row_index_array: Vec::new(),
1649            unencoded_byte_array_data_bytes_array: None,
1650            current_first_row_index: 0,
1651        }
1652    }
1653
1654    /// Append the row count of the next page.
1655    pub fn append_row_count(&mut self, row_count: i64) {
1656        let current_page_row_index = self.current_first_row_index;
1657        self.first_row_index_array.push(current_page_row_index);
1658        self.current_first_row_index += row_count;
1659    }
1660
1661    /// Append the offset and size of the next page.
1662    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1663        self.offset_array.push(offset);
1664        self.compressed_page_size_array.push(compressed_page_size);
1665    }
1666
1667    /// Append the unencoded byte array data bytes of the next page.
1668    pub fn append_unencoded_byte_array_data_bytes(
1669        &mut self,
1670        unencoded_byte_array_data_bytes: Option<i64>,
1671    ) {
1672        if let Some(val) = unencoded_byte_array_data_bytes {
1673            self.unencoded_byte_array_data_bytes_array
1674                .get_or_insert(Vec::new())
1675                .push(val);
1676        }
1677    }
1678
1679    /// Build and get the thrift metadata of offset index
1680    pub fn build(self) -> OffsetIndexMetaData {
1681        let locations = self
1682            .offset_array
1683            .iter()
1684            .zip(self.compressed_page_size_array.iter())
1685            .zip(self.first_row_index_array.iter())
1686            .map(|((offset, size), row_index)| PageLocation {
1687                offset: *offset,
1688                compressed_page_size: *size,
1689                first_row_index: *row_index,
1690            })
1691            .collect::<Vec<_>>();
1692        OffsetIndexMetaData {
1693            page_locations: locations,
1694            unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes_array,
1695        }
1696    }
1697}
1698
// Unit tests for the metadata builders and their thrift round-trip
// (de)serialization. Several tests assert exact `memory_size()` values,
// which are tied to the current struct layout and active cargo features.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::basic::{PageType, SortOrder};
    use crate::file::metadata::thrift::tests::{
        read_column_chunk, read_column_chunk_with_options, read_row_group,
    };

    // Backwards-compatibility check for the deprecated `update_from_levels`:
    // levels 0..=2 are tallied into a 3-slot histogram (one 0, one 1, three 2s).
    #[test]
    #[allow(deprecated)]
    fn test_level_histogram_update_from_levels_compat() {
        let mut histogram = LevelHistogram::try_new(2).unwrap();
        histogram.update_from_levels(&[0, 2, 1, 2, 2]);
        assert_eq!(histogram.values(), &[1, 1, 3]);
    }

    // Round-trips a fully-populated RowGroupMetaData through the thrift
    // compact protocol and verifies the decoded value equals the original.
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for ptr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta.write_thrift(&mut writer).unwrap();

        let row_group_res = read_row_group(&mut buf, schema_descr).unwrap();

        assert_eq!(row_group_res, row_group_meta);
    }

    // Building a row group without any column metadata must fail: the test
    // schema has 2 columns, so the builder reports a column length mismatch.
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

        assert!(row_group_meta.is_err());
        if let Err(e) = row_group_meta {
            assert_eq!(
                format!("{e}"),
                "Parquet error: Column length mismatch: 2 != 0"
            );
        }
    }

    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        // Schema with columns "a" and "b" — used to serialize the row group.
        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        // Schema with an extra column "c" — deliberately mismatched on read.
        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
            SchemaType::group_type_builder("schema")
                .with_fields(vec![
                    Arc::new(
                        SchemaType::primitive_type_builder("a", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("b", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                    Arc::new(
                        SchemaType::primitive_type_builder("c", Type::INT32)
                            .build()
                            .unwrap(),
                    ),
                ])
                .build()
                .unwrap(),
        )));

        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();
        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        row_group_meta_2cols.write_thrift(&mut writer).unwrap();

        // Decoding with the 3-column schema must fail with a clear error
        // rather than panic or silently misalign columns.
        let err = read_row_group(&mut buf, schema_descr_3cols)
            .unwrap_err()
            .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }

    // Round-trips a ColumnChunkMetaData with every optional field set. Note
    // the expected value uses `set_page_encoding_stats_mask` because the
    // default read path condenses per-page encoding stats into a mask.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);
        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression_codec(CompressionCodec::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        col_metadata.write_thrift(&mut writer).unwrap();
        let col_chunk_res = read_column_chunk(&mut buf, column_descr.clone()).unwrap();

        let expected_metadata = ColumnChunkMetaData::builder(column_descr)
            .set_encodings_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression_codec(CompressionCodec::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        assert_eq!(col_chunk_res, expected_metadata);
    }

    // Round-trip with `with_encoding_stats_as_mask(false)`: the full per-page
    // encoding stats survive decoding, so the result equals the input exactly.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_full_stats() {
        let column_descr = get_test_schema_descr().column(0);
        let stats = vec![
            PageEncodingStats {
                page_type: PageType::DATA_PAGE,
                encoding: Encoding::PLAIN,
                count: 3,
            },
            PageEncodingStats {
                page_type: PageType::DATA_PAGE,
                encoding: Encoding::RLE,
                count: 5,
            },
        ];
        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings_mask(EncodingMask::new_from_encodings(
                [Encoding::PLAIN, Encoding::RLE].iter(),
            ))
            .set_num_values(1000)
            .set_compression_codec(CompressionCodec::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_page_encoding_stats(stats)
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        col_metadata.write_thrift(&mut writer).unwrap();

        let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
        let col_chunk_res =
            read_column_chunk_with_options(&mut buf, column_descr, Some(&options)).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    // Round-trip of a column chunk with all optional fields left unset.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_empty() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .build()
            .unwrap();

        let mut buf = Vec::new();
        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
        col_metadata.write_thrift(&mut writer).unwrap();
        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    // `compressed_size` sums the compressed sizes of all column chunks:
    // 2 columns * 500 bytes = 1000.
    #[test]
    fn test_compressed_size() {
        let schema_descr = get_test_schema_descr();

        let mut columns = vec![];
        for column_descr in schema_descr.columns() {
            let column = ColumnChunkMetaData::builder(column_descr.clone())
                .set_total_compressed_size(500)
                .set_total_uncompressed_size(700)
                .build()
                .unwrap();
            columns.push(column);
        }
        let row_group_meta = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();

        let compressed_size_res: i64 = row_group_meta.compressed_size();
        let compressed_size_exp: i64 = 1000;

        assert_eq!(compressed_size_res, compressed_size_exp);
    }

    // Pins exact `memory_size()` values for ParquetMetaData, first with
    // minimal statistics and then with column/offset indexes attached.
    // NOTE(review): the expected byte counts are layout-dependent (they
    // differ under the "encryption" feature) and must be updated whenever
    // metadata struct fields change.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Now, add in Exact Statistics
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        // Baseline size; larger with the "encryption" feature because the
        // metadata structs carry extra fields in that configuration.
        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2734;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2902;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN);
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build().unwrap();
        let native_index = match column_index {
            ColumnIndexMetaData::BOOLEAN(index) => index,
            _ => panic!("wrong type of column index"),
        };

        // Now, add in OffsetIndex
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![offset_index]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 3160;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3328;

        // more set fields means more memory usage
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }

    // Same idea as `test_memory_size`, but verifies that attaching a
    // `FileDecryptor` increases the accounted size. Only compiled with the
    // "encryption" feature enabled.
    #[test]
    #[cfg(feature = "encryption")]
    fn test_memory_size_with_decryptor() {
        use crate::encryption::decrypt::FileDecryptionProperties;
        use crate::file::metadata::thrift::encryption::AesGcmV1;

        let schema_descr = get_test_schema_descr();

        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build())
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let aad_file_unique = vec![1u8; 8];
        let aad_prefix = vec![2u8; 8];
        let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 {
            aad_prefix: Some(aad_prefix.clone()),
            aad_file_unique: Some(aad_file_unique.clone()),
            supply_aad_prefix: Some(true),
        });
        let footer_key_metadata = Some(vec![3u8; 8]);
        let file_metadata =
            FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None)
                .with_encryption_algorithm(Some(encryption_algorithm))
                .with_footer_signing_key_metadata(footer_key_metadata.clone());

        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .build();

        // Layout-dependent constant; update when metadata fields change.
        let base_expected_size = 2042;
        assert_eq!(parquet_meta_data.memory_size(), base_expected_size);

        let footer_key = "0123456789012345".as_bytes();
        let column_key = "1234567890123450".as_bytes();
        let mut decryption_properties_builder =
            FileDecryptionProperties::builder(footer_key.to_vec())
                .with_aad_prefix(aad_prefix.clone());
        for column in schema_descr.columns() {
            decryption_properties_builder = decryption_properties_builder
                .with_column_key(&column.path().string(), column_key.to_vec());
        }
        let decryption_properties = decryption_properties_builder.build().unwrap();
        let decryptor = FileDecryptor::new(
            &decryption_properties,
            footer_key_metadata.as_deref(),
            aad_file_unique,
            aad_prefix,
        )
        .unwrap();

        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta.clone())
            .set_file_decryptor(Some(decryptor))
            .build();

        let expected_size_with_decryptor = 3056;
        assert!(expected_size_with_decryptor > base_expected_size);

        assert_eq!(
            parquet_meta_data.memory_size(),
            expected_size_with_decryptor
        );
    }

    /// Returns sample schema descriptor so we can create column metadata.
    fn get_test_schema_descr() -> SchemaDescPtr {
        // Two INT32 leaf columns, "a" and "b".
        let schema = SchemaType::group_type_builder("schema")
            .with_fields(vec![
                Arc::new(
                    SchemaType::primitive_type_builder("a", Type::INT32)
                        .build()
                        .unwrap(),
                ),
                Arc::new(
                    SchemaType::primitive_type_builder("b", Type::INT32)
                        .build()
                        .unwrap(),
                ),
            ])
            .build()
            .unwrap();

        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
    }
}