parquet/file/metadata/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet metadata API
19//!
20//! Most users should use these structures to interact with Parquet metadata.
21//! The [crate::format] module contains lower level structures generated from the
22//! Parquet thrift definition.
23//!
24//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
25//!   file footer.
26//!
27//! * [`FileMetaData`]: File level metadata such as schema, row counts and
28//!   version.
29//!
30//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
31//!   location and number of rows, and column chunks.
32//!
33//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
34//!   within a Row Group including encoding and compression information,
35//!   number of values, statistics, etc.
36//!
37//! # APIs for working with Parquet Metadata
38//!
39//! The Parquet readers and writers in this crate handle reading and writing
40//! metadata into parquet files. To work with metadata directly,
41//! the following APIs are available:
42//!
43//! * [`ParquetMetaDataReader`] for reading
44//! * [`ParquetMetaDataWriter`] for writing.
45//!
46//! [`ParquetMetaDataReader`]: https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html
47//! [`ParquetMetaDataWriter`]: https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataWriter.html
48//!
49//! # Examples
50//!
51//! Please see [`external_metadata.rs`]
52//!
53//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
54//!
55//! # Metadata Encodings and Structures
56//!
57//! There are three different encodings of Parquet Metadata in this crate:
58//!
59//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
60//!    [parquet.thrift]
61//!
62//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
63//!    from [parquet.thrift]. These structures are low level and mirror
64//!    the thrift definitions.
65//!
66//! 3. [`file::metadata`] (this module): Easier to use Rust structures
67//!    with a more idiomatic API. Note that, confusingly, some but not all
68//!    of these structures have the same name as the [`format`] structures.
69//!
70//! [`format`]: crate::format
71//! [`file::metadata`]: crate::file::metadata
72//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
73//!
74//! Graphically, this is how the different structures relate to each other:
75//!
76//! ```text
77//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
78//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
79//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
80//!                            └──────────────┘     │         └───────────────────────┘ │
81//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
82//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
83//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
84//!                                     ...         │                   ...             │
85//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
86//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
87//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
88//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
89//!
90//!                          format::meta structures          file::metadata structures
91//!
92//!                         * Same name, different struct
93//! ```
94mod memory;
95pub(crate) mod reader;
96mod writer;
97
98use crate::basic::{ColumnOrder, Compression, Encoding, Type};
99#[cfg(feature = "encryption")]
100use crate::encryption::{
101    decrypt::FileDecryptor,
102    modules::{create_module_aad, ModuleType},
103};
104use crate::errors::{ParquetError, Result};
105#[cfg(feature = "encryption")]
106use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData};
107pub(crate) use crate::file::metadata::memory::HeapSize;
108use crate::file::page_encoding_stats::{self, PageEncodingStats};
109use crate::file::page_index::index::Index;
110use crate::file::page_index::offset_index::OffsetIndexMetaData;
111use crate::file::statistics::{self, Statistics};
112use crate::format::ColumnCryptoMetaData as TColumnCryptoMetaData;
113use crate::format::{
114    BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup,
115    SizeStatistics, SortingColumn,
116};
117use crate::schema::types::{
118    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
119    Type as SchemaType,
120};
121#[cfg(feature = "encryption")]
122use crate::thrift::{TCompactSliceInputProtocol, TSerializable};
123pub use reader::{FooterTail, ParquetMetaDataReader};
124use std::ops::Range;
125use std::sync::Arc;
126pub use writer::ParquetMetaDataWriter;
127pub(crate) use writer::ThriftMetadataWriter;
128
/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`Index`] holds statistics about all the pages in a
/// particular column chunk.
///
/// # Layout
///
/// The outer `Vec` is indexed by row group number and the inner `Vec` by
/// column number, so `column_index[row_group_number][column_number]` holds
/// the [`Index`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example `column_index[2][3]` holds the [`Index`] for the fourth
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetColumnIndex = Vec<Vec<Index>>;
145
/// [`OffsetIndexMetaData`] for each column chunk of each row group.
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// # Layout
///
/// The outer `Vec` is indexed by row group number and the inner `Vec` by
/// column number, so `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column `column_number` of
/// row group `row_group_number`.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
157
158/// Parsed metadata for a single Parquet file
159///
160/// This structure is stored in the footer of Parquet files, in the format
161/// defined by [`parquet.thrift`].
162///
163/// # Overview
164/// The fields of this structure are:
165/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
166/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
167/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
168///
169/// This structure is read by the various readers in this crate or can be read
170/// directly from a file using the [`ParquetMetaDataReader`] struct.
171///
172/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
173///
174/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
175#[derive(Debug, Clone, PartialEq)]
176pub struct ParquetMetaData {
177    /// File level metadata
178    file_metadata: FileMetaData,
179    /// Row group metadata
180    row_groups: Vec<RowGroupMetaData>,
181    /// Page level index for each page in each column chunk
182    column_index: Option<ParquetColumnIndex>,
183    /// Offset index for each page in each column chunk
184    offset_index: Option<ParquetOffsetIndex>,
185    /// Optional file decryptor
186    #[cfg(feature = "encryption")]
187    file_decryptor: Option<FileDecryptor>,
188}
189
190impl ParquetMetaData {
191    /// Creates Parquet metadata from file metadata and a list of row
192    /// group metadata
193    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
194        ParquetMetaData {
195            file_metadata,
196            row_groups,
197            #[cfg(feature = "encryption")]
198            file_decryptor: None,
199            column_index: None,
200            offset_index: None,
201        }
202    }
203
204    /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of
205    /// encrypted data.
206    #[cfg(feature = "encryption")]
207    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
208        self.file_decryptor = file_decryptor;
209    }
210
211    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
212    pub fn into_builder(self) -> ParquetMetaDataBuilder {
213        self.into()
214    }
215
216    /// Returns file metadata as reference.
217    pub fn file_metadata(&self) -> &FileMetaData {
218        &self.file_metadata
219    }
220
221    /// Returns file decryptor as reference.
222    #[cfg(feature = "encryption")]
223    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
224        self.file_decryptor.as_ref()
225    }
226
227    /// Returns number of row groups in this file.
228    pub fn num_row_groups(&self) -> usize {
229        self.row_groups.len()
230    }
231
232    /// Returns row group metadata for `i`th position.
233    /// Position should be less than number of row groups `num_row_groups`.
234    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
235        &self.row_groups[i]
236    }
237
238    /// Returns slice of row groups in this file.
239    pub fn row_groups(&self) -> &[RowGroupMetaData] {
240        &self.row_groups
241    }
242
243    /// Returns the column index for this file if loaded
244    ///
245    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
246    /// [ArrowReaderOptions::with_page_index] was set to false.
247    ///
248    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
249    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
250        self.column_index.as_ref()
251    }
252
253    /// Returns offset indexes in this file, if loaded
254    ///
255    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
256    /// [ArrowReaderOptions::with_page_index] was set to false.
257    ///
258    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
259    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
260        self.offset_index.as_ref()
261    }
262
263    /// Estimate of the bytes allocated to store `ParquetMetadata`
264    ///
265    /// # Notes:
266    ///
267    /// 1. Includes size of self
268    ///
269    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
270    ///    [`RowGroupMetaData`].
271    ///
272    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
273    ///    means `memory_size` will over estimate the memory size if such pointers
274    ///    are shared.
275    ///
276    /// 4. Does not include any allocator overheads
277    pub fn memory_size(&self) -> usize {
278        std::mem::size_of::<Self>()
279            + self.file_metadata.heap_size()
280            + self.row_groups.heap_size()
281            + self.column_index.heap_size()
282            + self.offset_index.heap_size()
283    }
284
285    /// Override the column index
286    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
287        self.column_index = index;
288    }
289
290    /// Override the offset index
291    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
292        self.offset_index = index;
293    }
294}
295
296/// A builder for creating / manipulating [`ParquetMetaData`]
297///
298/// # Example creating a new [`ParquetMetaData`]
299///
300///```no_run
301/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
302/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
303/// // Create a new builder given the file metadata
304/// let file_metadata = get_file_metadata();
305/// // Create a row group
306/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
307///    .set_num_rows(100)
308///    // ... (A real row group needs more than just the number of rows)
309///    .build()
310///    .unwrap();
311/// // Create the final metadata
312/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
313///   .add_row_group(row_group)
314///   .build();
315/// ```
316///
317/// # Example modifying an existing [`ParquetMetaData`]
318/// ```no_run
319/// # use parquet::file::metadata::ParquetMetaData;
320/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
321/// // Modify the metadata so only the last RowGroup remains
322/// let metadata: ParquetMetaData = load_metadata();
323/// let mut builder = metadata.into_builder();
324///
325/// // Take existing row groups to modify
326/// let mut row_groups = builder.take_row_groups();
327/// let last_row_group = row_groups.pop().unwrap();
328///
329/// let metadata = builder
330///   .add_row_group(last_row_group)
331///   .build();
332/// ```
333pub struct ParquetMetaDataBuilder(ParquetMetaData);
334
335impl ParquetMetaDataBuilder {
336    /// Create a new builder from a file metadata, with no row groups
337    pub fn new(file_meta_data: FileMetaData) -> Self {
338        Self(ParquetMetaData::new(file_meta_data, vec![]))
339    }
340
341    /// Create a new builder from an existing ParquetMetaData
342    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
343        Self(metadata)
344    }
345
346    /// Adds a row group to the metadata
347    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
348        self.0.row_groups.push(row_group);
349        self
350    }
351
352    /// Sets all the row groups to the specified list
353    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
354        self.0.row_groups = row_groups;
355        self
356    }
357
358    /// Takes ownership of the row groups in this builder, and clears the list
359    /// of row groups.
360    ///
361    /// This can be used for more efficient creation of a new ParquetMetaData
362    /// from an existing one.
363    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
364        std::mem::take(&mut self.0.row_groups)
365    }
366
367    /// Return a reference to the current row groups
368    pub fn row_groups(&self) -> &[RowGroupMetaData] {
369        &self.0.row_groups
370    }
371
372    /// Sets the column index
373    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
374        self.0.column_index = column_index;
375        self
376    }
377
378    /// Returns the current column index from the builder, replacing it with `None`
379    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
380        std::mem::take(&mut self.0.column_index)
381    }
382
383    /// Return a reference to the current column index, if any
384    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
385        self.0.column_index.as_ref()
386    }
387
388    /// Sets the offset index
389    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
390        self.0.offset_index = offset_index;
391        self
392    }
393
394    /// Returns the current offset index from the builder, replacing it with `None`
395    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
396        std::mem::take(&mut self.0.offset_index)
397    }
398
399    /// Return a reference to the current offset index, if any
400    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
401        self.0.offset_index.as_ref()
402    }
403
404    /// Creates a new ParquetMetaData from the builder
405    pub fn build(self) -> ParquetMetaData {
406        let Self(metadata) = self;
407        metadata
408    }
409}
410
411impl From<ParquetMetaData> for ParquetMetaDataBuilder {
412    fn from(meta_data: ParquetMetaData) -> Self {
413        Self(meta_data)
414    }
415}
416
/// A key-value pair for [`FileMetaData`].
///
/// This is an alias for the thrift-generated [`crate::format::KeyValue`]
/// structure rather than a distinct type.
pub type KeyValue = crate::format::KeyValue;

/// Reference counted pointer for [`FileMetaData`].
///
/// Cloning the pointer shares the same [`FileMetaData`] instead of copying it.
pub type FileMetaDataPtr = Arc<FileMetaData>;
422
423/// File level metadata for a Parquet file.
424///
425/// Includes the version of the file, metadata, number of rows, schema, and column orders
426#[derive(Debug, Clone, PartialEq)]
427pub struct FileMetaData {
428    version: i32,
429    num_rows: i64,
430    created_by: Option<String>,
431    key_value_metadata: Option<Vec<KeyValue>>,
432    schema_descr: SchemaDescPtr,
433    column_orders: Option<Vec<ColumnOrder>>,
434}
435
436impl FileMetaData {
437    /// Creates new file metadata.
438    pub fn new(
439        version: i32,
440        num_rows: i64,
441        created_by: Option<String>,
442        key_value_metadata: Option<Vec<KeyValue>>,
443        schema_descr: SchemaDescPtr,
444        column_orders: Option<Vec<ColumnOrder>>,
445    ) -> Self {
446        FileMetaData {
447            version,
448            num_rows,
449            created_by,
450            key_value_metadata,
451            schema_descr,
452            column_orders,
453        }
454    }
455
456    /// Returns version of this file.
457    pub fn version(&self) -> i32 {
458        self.version
459    }
460
461    /// Returns number of rows in the file.
462    pub fn num_rows(&self) -> i64 {
463        self.num_rows
464    }
465
466    /// String message for application that wrote this file.
467    ///
468    /// This should have the following format:
469    /// `<application> version <application version> (build <application build hash>)`.
470    ///
471    /// ```shell
472    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
473    /// ```
474    pub fn created_by(&self) -> Option<&str> {
475        self.created_by.as_deref()
476    }
477
478    /// Returns key_value_metadata of this file.
479    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
480        self.key_value_metadata.as_ref()
481    }
482
483    /// Returns Parquet [`Type`] that describes schema in this file.
484    ///
485    /// [`Type`]: crate::schema::types::Type
486    pub fn schema(&self) -> &SchemaType {
487        self.schema_descr.root_schema()
488    }
489
490    /// Returns a reference to schema descriptor.
491    pub fn schema_descr(&self) -> &SchemaDescriptor {
492        &self.schema_descr
493    }
494
495    /// Returns reference counted clone for schema descriptor.
496    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
497        self.schema_descr.clone()
498    }
499
500    /// Column (sort) order used for `min` and `max` values of each column in this file.
501    ///
502    /// Each column order corresponds to one column, determined by its position in the
503    /// list, matching the position of the column in the schema.
504    ///
505    /// When `None` is returned, there are no column orders available, and each column
506    /// should be assumed to have undefined (legacy) column order.
507    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
508        self.column_orders.as_ref()
509    }
510
511    /// Returns column order for `i`th column in this file.
512    /// If column orders are not available, returns undefined (legacy) column order.
513    pub fn column_order(&self, i: usize) -> ColumnOrder {
514        self.column_orders
515            .as_ref()
516            .map(|data| data[i])
517            .unwrap_or(ColumnOrder::UNDEFINED)
518    }
519}
520
/// Reference counted pointer for [`RowGroupMetaData`].
///
/// Cloning the pointer shares the same [`RowGroupMetaData`] instead of copying it.
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
523
524/// Metadata for a row group
525///
526/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows
527/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
528#[derive(Debug, Clone, PartialEq)]
529pub struct RowGroupMetaData {
530    columns: Vec<ColumnChunkMetaData>,
531    num_rows: i64,
532    sorting_columns: Option<Vec<SortingColumn>>,
533    total_byte_size: i64,
534    schema_descr: SchemaDescPtr,
535    /// We can't infer from file offset of first column since there may empty columns in row group.
536    file_offset: Option<i64>,
537    /// Ordinal position of this row group in file
538    ordinal: Option<i16>,
539}
540
541impl RowGroupMetaData {
542    /// Returns builder for row group metadata.
543    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
544        RowGroupMetaDataBuilder::new(schema_descr)
545    }
546
547    /// Number of columns in this row group.
548    pub fn num_columns(&self) -> usize {
549        self.columns.len()
550    }
551
552    /// Returns column chunk metadata for `i`th column.
553    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
554        &self.columns[i]
555    }
556
557    /// Returns slice of column chunk metadata.
558    pub fn columns(&self) -> &[ColumnChunkMetaData] {
559        &self.columns
560    }
561
562    /// Returns mutable slice of column chunk metadata.
563    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
564        &mut self.columns
565    }
566
567    /// Number of rows in this row group.
568    pub fn num_rows(&self) -> i64 {
569        self.num_rows
570    }
571
572    /// Returns the sort ordering of the rows in this RowGroup if any
573    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
574        self.sorting_columns.as_ref()
575    }
576
577    /// Total byte size of all uncompressed column data in this row group.
578    pub fn total_byte_size(&self) -> i64 {
579        self.total_byte_size
580    }
581
582    /// Total size of all compressed column data in this row group.
583    pub fn compressed_size(&self) -> i64 {
584        self.columns.iter().map(|c| c.total_compressed_size).sum()
585    }
586
587    /// Returns reference to a schema descriptor.
588    pub fn schema_descr(&self) -> &SchemaDescriptor {
589        self.schema_descr.as_ref()
590    }
591
592    /// Returns reference counted clone of schema descriptor.
593    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
594        self.schema_descr.clone()
595    }
596
597    /// Returns ordinal position of this row group in file.
598    ///
599    /// For example if this is the first row group in the file, this will return 0.
600    /// If this is the second row group in the file, this will return 1.
601    #[inline(always)]
602    pub fn ordinal(&self) -> Option<i16> {
603        self.ordinal
604    }
605
606    /// Returns file offset of this row group in file.
607    #[inline(always)]
608    pub fn file_offset(&self) -> Option<i64> {
609        self.file_offset
610    }
611
612    /// Method to convert from encrypted Thrift.
613    #[cfg(feature = "encryption")]
614    fn from_encrypted_thrift(
615        schema_descr: SchemaDescPtr,
616        mut rg: RowGroup,
617        decryptor: Option<&FileDecryptor>,
618    ) -> Result<RowGroupMetaData> {
619        if schema_descr.num_columns() != rg.columns.len() {
620            return Err(general_err!(
621                "Column count mismatch. Schema has {} columns while Row Group has {}",
622                schema_descr.num_columns(),
623                rg.columns.len()
624            ));
625        }
626        let total_byte_size = rg.total_byte_size;
627        let num_rows = rg.num_rows;
628        let mut columns = vec![];
629
630        for (i, (mut c, d)) in rg
631            .columns
632            .drain(0..)
633            .zip(schema_descr.columns())
634            .enumerate()
635        {
636            // Read encrypted metadata if it's present and we have a decryptor.
637            if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) {
638                let column_decryptor = match c.crypto_metadata.as_ref() {
639                    None => {
640                        return Err(general_err!(
641                            "No crypto_metadata is set for column '{}', which has encrypted metadata",
642                            d.path().string()
643                        ));
644                    }
645                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => {
646                        let column_name = crypto_metadata.path_in_schema.join(".");
647                        decryptor.get_column_metadata_decryptor(
648                            column_name.as_str(),
649                            crypto_metadata.key_metadata.as_deref(),
650                        )?
651                    }
652                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => {
653                        decryptor.get_footer_decryptor()?
654                    }
655                };
656
657                let column_aad = create_module_aad(
658                    decryptor.file_aad(),
659                    ModuleType::ColumnMetaData,
660                    rg.ordinal.unwrap() as usize,
661                    i,
662                    None,
663                )?;
664
665                let buf = c.encrypted_column_metadata.clone().unwrap();
666                let decrypted_cc_buf = column_decryptor
667                    .decrypt(buf.as_slice(), column_aad.as_ref())
668                    .map_err(|_| {
669                        general_err!(
670                            "Unable to decrypt column '{}', perhaps the column key is wrong?",
671                            d.path().string()
672                        )
673                    })?;
674
675                let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice());
676                c.meta_data = Some(ColumnMetaData::read_from_in_protocol(&mut prot)?);
677            }
678            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
679        }
680
681        let sorting_columns = rg.sorting_columns;
682        Ok(RowGroupMetaData {
683            columns,
684            num_rows,
685            sorting_columns,
686            total_byte_size,
687            schema_descr,
688            file_offset: rg.file_offset,
689            ordinal: rg.ordinal,
690        })
691    }
692
693    /// Method to convert from Thrift.
694    pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
695        if schema_descr.num_columns() != rg.columns.len() {
696            return Err(general_err!(
697                "Column count mismatch. Schema has {} columns while Row Group has {}",
698                schema_descr.num_columns(),
699                rg.columns.len()
700            ));
701        }
702        let total_byte_size = rg.total_byte_size;
703        let num_rows = rg.num_rows;
704        let mut columns = vec![];
705
706        for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) {
707            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
708        }
709
710        let sorting_columns = rg.sorting_columns;
711        Ok(RowGroupMetaData {
712            columns,
713            num_rows,
714            sorting_columns,
715            total_byte_size,
716            schema_descr,
717            file_offset: rg.file_offset,
718            ordinal: rg.ordinal,
719        })
720    }
721
722    /// Method to convert to Thrift.
723    pub fn to_thrift(&self) -> RowGroup {
724        RowGroup {
725            columns: self.columns().iter().map(|v| v.to_thrift()).collect(),
726            total_byte_size: self.total_byte_size,
727            num_rows: self.num_rows,
728            sorting_columns: self.sorting_columns().cloned(),
729            file_offset: self.file_offset(),
730            total_compressed_size: Some(self.compressed_size()),
731            ordinal: self.ordinal,
732        }
733    }
734
735    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
736    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
737        RowGroupMetaDataBuilder(self)
738    }
739}
740
741/// Builder for row group metadata.
742pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
743
744impl RowGroupMetaDataBuilder {
745    /// Creates new builder from schema descriptor.
746    fn new(schema_descr: SchemaDescPtr) -> Self {
747        Self(RowGroupMetaData {
748            columns: Vec::with_capacity(schema_descr.num_columns()),
749            schema_descr,
750            file_offset: None,
751            num_rows: 0,
752            sorting_columns: None,
753            total_byte_size: 0,
754            ordinal: None,
755        })
756    }
757
758    /// Sets number of rows in this row group.
759    pub fn set_num_rows(mut self, value: i64) -> Self {
760        self.0.num_rows = value;
761        self
762    }
763
764    /// Sets the sorting order for columns
765    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
766        self.0.sorting_columns = value;
767        self
768    }
769
770    /// Sets total size in bytes for this row group.
771    pub fn set_total_byte_size(mut self, value: i64) -> Self {
772        self.0.total_byte_size = value;
773        self
774    }
775
776    /// Takes ownership of the the column metadata in this builder, and clears
777    /// the list of columns.
778    ///
779    /// This can be used for more efficient creation of a new RowGroupMetaData
780    /// from an existing one.
781    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
782        std::mem::take(&mut self.0.columns)
783    }
784
785    /// Sets column metadata for this row group.
786    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
787        self.0.columns = value;
788        self
789    }
790
791    /// Adds a column metadata to this row group
792    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
793        self.0.columns.push(value);
794        self
795    }
796
797    /// Sets ordinal for this row group.
798    pub fn set_ordinal(mut self, value: i16) -> Self {
799        self.0.ordinal = Some(value);
800        self
801    }
802
803    /// Sets file offset for this row group.
804    pub fn set_file_offset(mut self, value: i64) -> Self {
805        self.0.file_offset = Some(value);
806        self
807    }
808
809    /// Builds row group metadata.
810    pub fn build(self) -> Result<RowGroupMetaData> {
811        if self.0.schema_descr.num_columns() != self.0.columns.len() {
812            return Err(general_err!(
813                "Column length mismatch: {} != {}",
814                self.0.schema_descr.num_columns(),
815                self.0.columns.len()
816            ));
817        }
818
819        Ok(self.0)
820    }
821}
822
/// Metadata for a column chunk.
///
/// Holds the values converted from a Thrift `ColumnChunk` and its nested
/// `ColumnMetaData` (see `from_thrift` / `to_thrift`), together with the
/// descriptor of the column the chunk belongs to.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor for this column.
    column_descr: ColumnDescPtr,
    /// All encodings used for this column.
    encodings: Vec<Encoding>,
    /// File where the column chunk is stored; `None` means the same file as
    /// the metadata.
    file_path: Option<String>,
    /// Byte offset of `ColumnMetaData` in `file_path`. Deprecated by the
    /// Parquet specification, see `file_offset()`.
    file_offset: i64,
    /// Total number of values in this column chunk.
    num_values: i64,
    /// Compression for this column.
    compression: Compression,
    /// Total compressed data size of this column chunk.
    total_compressed_size: i64,
    /// Total uncompressed data size of this column chunk.
    total_uncompressed_size: i64,
    /// Offset for the column data.
    data_page_offset: i64,
    /// Offset for the index page, if any.
    index_page_offset: Option<i64>,
    /// Offset for the dictionary page, if any.
    dictionary_page_offset: Option<i64>,
    /// Statistics for this column chunk, if available.
    statistics: Option<Statistics>,
    /// Page encoding stats for this column chunk, if available.
    encoding_stats: Option<Vec<PageEncodingStats>>,
    /// Offset for the bloom filter, if any.
    bloom_filter_offset: Option<i64>,
    /// Length of the bloom filter in bytes, if known.
    bloom_filter_length: Option<i32>,
    /// Offset for the offset index, if any.
    offset_index_offset: Option<i64>,
    /// Length of the offset index in bytes, if any.
    offset_index_length: Option<i32>,
    /// Offset for the column index, if any.
    column_index_offset: Option<i64>,
    /// Length of the column index in bytes, if any.
    column_index_length: Option<i32>,
    /// Number of bytes of variable length data after decoding
    /// (only set for BYTE_ARRAY columns).
    unencoded_byte_array_data_bytes: Option<i64>,
    /// Histogram of repetition levels, if available.
    repetition_level_histogram: Option<LevelHistogram>,
    /// Histogram of definition levels, if available.
    definition_level_histogram: Option<LevelHistogram>,
    /// Encryption metadata for this column chunk, if any.
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<ColumnCryptoMetaData>,
}
851
/// Histograms for repetition and definition levels.
///
/// A histogram is a vector of `max_level + 1` counters; the counter at index
/// `i` holds the number of values seen at level `i`.
///
/// For example, `vec[0]` counts the values at level 0, `vec[1]` counts the
/// values at level 1, and so on.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a zero-filled histogram with `max_level + 1` entries.
    ///
    /// Returns `None` unless `max_level` is greater than 0, because a
    /// histogram is not necessary in that case.
    pub fn try_new(max_level: i16) -> Option<Self> {
        (max_level > 0).then(|| Self {
            inner: vec![0; max_level as usize + 1],
        })
    }

    /// Borrows the counters of this histogram, indexed by level.
    pub fn values(&self) -> &[i64] {
        self.inner.as_slice()
    }

    /// Consumes the histogram and returns the underlying vector of counters.
    pub fn into_inner(self) -> Vec<i64> {
        let Self { inner } = self;
        inner
    }

    /// Returns the counter stored for level `index`.
    ///
    /// For example, `get(1)` is the number of values with level 1.
    ///
    /// Returns `None` if `index` is out of bounds.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.values().get(index).copied()
    }

    /// Adds every counter of `other` to the matching counter of this histogram.
    ///
    /// # Panics
    /// If the two histograms have different lengths.
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        self.inner
            .iter_mut()
            .zip(other.values())
            .for_each(|(total, extra)| *total += *extra);
    }

    /// Number of counters (levels) in the histogram.
    pub fn len(&self) -> usize {
        self.values().len()
    }

    /// Whether the histogram holds no counters at all.
    pub fn is_empty(&self) -> bool {
        self.values().is_empty()
    }

    /// Sets every counter back to 0 while keeping the length.
    pub fn reset(&mut self) {
        self.inner.fill(0);
    }

    /// Increments, for each entry of `levels`, the counter of that level.
    ///
    /// # Panics
    /// If a level, converted to an index, is not smaller than the length of
    /// the histogram (for a histogram created with [`Self::try_new`], that is
    /// any level greater than the supplied `max_level`, or a negative one).
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for slot in levels.iter().map(|&level| level as usize) {
            self.inner[slot] += 1;
        }
    }
}

impl From<Vec<i64>> for LevelHistogram {
    /// Wraps an existing vector of per-level counters without copying it.
    fn from(inner: Vec<i64>) -> Self {
        LevelHistogram { inner }
    }
}

impl From<LevelHistogram> for Vec<i64> {
    /// Unwraps the histogram into its vector of per-level counters.
    fn from(value: LevelHistogram) -> Self {
        let LevelHistogram { inner } = value;
        inner
    }
}
951
952impl HeapSize for LevelHistogram {
953    fn heap_size(&self) -> usize {
954        self.inner.heap_size()
955    }
956}
957
958/// Represents common operations for a column chunk.
959impl ColumnChunkMetaData {
960    /// Returns builder for column chunk metadata.
961    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
962        ColumnChunkMetaDataBuilder::new(column_descr)
963    }
964
965    /// File where the column chunk is stored.
966    ///
967    /// If not set, assumed to belong to the same file as the metadata.
968    /// This path is relative to the current file.
969    pub fn file_path(&self) -> Option<&str> {
970        self.file_path.as_deref()
971    }
972
973    /// Byte offset of `ColumnMetaData` in `file_path()`.
974    ///
975    /// Note that the meaning of this field has been inconsistent between implementations
976    /// so its use has since been deprecated in the Parquet specification. Modern implementations
977    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
978    /// `ColumnChunk` struct.
979    pub fn file_offset(&self) -> i64 {
980        self.file_offset
981    }
982
983    /// Type of this column. Must be primitive.
984    pub fn column_type(&self) -> Type {
985        self.column_descr.physical_type()
986    }
987
988    /// Path (or identifier) of this column.
989    pub fn column_path(&self) -> &ColumnPath {
990        self.column_descr.path()
991    }
992
993    /// Descriptor for this column.
994    pub fn column_descr(&self) -> &ColumnDescriptor {
995        self.column_descr.as_ref()
996    }
997
998    /// Reference counted clone of descriptor for this column.
999    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
1000        self.column_descr.clone()
1001    }
1002
1003    /// All encodings used for this column.
1004    pub fn encodings(&self) -> &Vec<Encoding> {
1005        &self.encodings
1006    }
1007
1008    /// Total number of values in this column chunk.
1009    pub fn num_values(&self) -> i64 {
1010        self.num_values
1011    }
1012
1013    /// Compression for this column.
1014    pub fn compression(&self) -> Compression {
1015        self.compression
1016    }
1017
1018    /// Returns the total compressed data size of this column chunk.
1019    pub fn compressed_size(&self) -> i64 {
1020        self.total_compressed_size
1021    }
1022
1023    /// Returns the total uncompressed data size of this column chunk.
1024    pub fn uncompressed_size(&self) -> i64 {
1025        self.total_uncompressed_size
1026    }
1027
1028    /// Returns the offset for the column data.
1029    pub fn data_page_offset(&self) -> i64 {
1030        self.data_page_offset
1031    }
1032
1033    /// Returns the offset for the index page.
1034    pub fn index_page_offset(&self) -> Option<i64> {
1035        self.index_page_offset
1036    }
1037
1038    /// Returns the offset for the dictionary page, if any.
1039    pub fn dictionary_page_offset(&self) -> Option<i64> {
1040        self.dictionary_page_offset
1041    }
1042
1043    /// Returns the offset and length in bytes of the column chunk within the file
1044    pub fn byte_range(&self) -> (u64, u64) {
1045        let col_start = match self.dictionary_page_offset() {
1046            Some(dictionary_page_offset) => dictionary_page_offset,
1047            None => self.data_page_offset(),
1048        };
1049        let col_len = self.compressed_size();
1050        assert!(
1051            col_start >= 0 && col_len >= 0,
1052            "column start and length should not be negative"
1053        );
1054        (col_start as u64, col_len as u64)
1055    }
1056
1057    /// Returns statistics that are set for this column chunk,
1058    /// or `None` if no statistics are available.
1059    pub fn statistics(&self) -> Option<&Statistics> {
1060        self.statistics.as_ref()
1061    }
1062
1063    /// Returns the offset for the page encoding stats,
1064    /// or `None` if no page encoding stats are available.
1065    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
1066        self.encoding_stats.as_ref()
1067    }
1068
1069    /// Returns the offset for the bloom filter.
1070    pub fn bloom_filter_offset(&self) -> Option<i64> {
1071        self.bloom_filter_offset
1072    }
1073
1074    /// Returns the offset for the bloom filter.
1075    pub fn bloom_filter_length(&self) -> Option<i32> {
1076        self.bloom_filter_length
1077    }
1078
1079    /// Returns the offset for the column index.
1080    pub fn column_index_offset(&self) -> Option<i64> {
1081        self.column_index_offset
1082    }
1083
1084    /// Returns the offset for the column index length.
1085    pub fn column_index_length(&self) -> Option<i32> {
1086        self.column_index_length
1087    }
1088
1089    /// Returns the range for the offset index if any
1090    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
1091        let offset = u64::try_from(self.column_index_offset?).ok()?;
1092        let length = u64::try_from(self.column_index_length?).ok()?;
1093        Some(offset..(offset + length))
1094    }
1095
1096    /// Returns the offset for the offset index.
1097    pub fn offset_index_offset(&self) -> Option<i64> {
1098        self.offset_index_offset
1099    }
1100
1101    /// Returns the offset for the offset index length.
1102    pub fn offset_index_length(&self) -> Option<i32> {
1103        self.offset_index_length
1104    }
1105
1106    /// Returns the range for the offset index if any
1107    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
1108        let offset = u64::try_from(self.offset_index_offset?).ok()?;
1109        let length = u64::try_from(self.offset_index_length?).ok()?;
1110        Some(offset..(offset + length))
1111    }
1112
1113    /// Returns the number of bytes of variable length data after decoding.
1114    ///
1115    /// Only set for BYTE_ARRAY columns. This field may not be set by older
1116    /// writers.
1117    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
1118        self.unencoded_byte_array_data_bytes
1119    }
1120
1121    /// Returns the repetition level histogram.
1122    ///
1123    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
1124    /// `vec[0]` indicates how many rows the page contains.
1125    /// This field may not be set by older writers.
1126    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
1127        self.repetition_level_histogram.as_ref()
1128    }
1129
1130    /// Returns the definition level histogram.
1131    ///
1132    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
1133    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
1134    /// This field may not be set by older writers.
1135    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
1136        self.definition_level_histogram.as_ref()
1137    }
1138
1139    /// Returns the encryption metadata for this column chunk.
1140    #[cfg(feature = "encryption")]
1141    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
1142        self.column_crypto_metadata.as_ref()
1143    }
1144
1145    /// Method to convert from Thrift.
1146    pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
1147        if cc.meta_data.is_none() {
1148            return Err(general_err!("Expected to have column metadata"));
1149        }
1150        let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap();
1151        let column_type = Type::try_from(col_metadata.type_)?;
1152        let encodings = col_metadata
1153            .encodings
1154            .drain(0..)
1155            .map(Encoding::try_from)
1156            .collect::<Result<_>>()?;
1157        let compression = Compression::try_from(col_metadata.codec)?;
1158        let file_path = cc.file_path;
1159        let file_offset = cc.file_offset;
1160        let num_values = col_metadata.num_values;
1161        let total_compressed_size = col_metadata.total_compressed_size;
1162        let total_uncompressed_size = col_metadata.total_uncompressed_size;
1163        let data_page_offset = col_metadata.data_page_offset;
1164        let index_page_offset = col_metadata.index_page_offset;
1165        let dictionary_page_offset = col_metadata.dictionary_page_offset;
1166        let statistics = statistics::from_thrift(column_type, col_metadata.statistics)?;
1167        let encoding_stats = col_metadata
1168            .encoding_stats
1169            .as_ref()
1170            .map(|vec| {
1171                vec.iter()
1172                    .map(page_encoding_stats::try_from_thrift)
1173                    .collect::<Result<_>>()
1174            })
1175            .transpose()?;
1176        let bloom_filter_offset = col_metadata.bloom_filter_offset;
1177        let bloom_filter_length = col_metadata.bloom_filter_length;
1178        let offset_index_offset = cc.offset_index_offset;
1179        let offset_index_length = cc.offset_index_length;
1180        let column_index_offset = cc.column_index_offset;
1181        let column_index_length = cc.column_index_length;
1182        let (
1183            unencoded_byte_array_data_bytes,
1184            repetition_level_histogram,
1185            definition_level_histogram,
1186        ) = if let Some(size_stats) = col_metadata.size_statistics {
1187            (
1188                size_stats.unencoded_byte_array_data_bytes,
1189                size_stats.repetition_level_histogram,
1190                size_stats.definition_level_histogram,
1191            )
1192        } else {
1193            (None, None, None)
1194        };
1195
1196        let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from);
1197        let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);
1198
1199        #[cfg(feature = "encryption")]
1200        let column_crypto_metadata = if let Some(crypto_metadata) = cc.crypto_metadata {
1201            Some(column_crypto_metadata::try_from_thrift(&crypto_metadata)?)
1202        } else {
1203            None
1204        };
1205
1206        let result = ColumnChunkMetaData {
1207            column_descr,
1208            encodings,
1209            file_path,
1210            file_offset,
1211            num_values,
1212            compression,
1213            total_compressed_size,
1214            total_uncompressed_size,
1215            data_page_offset,
1216            index_page_offset,
1217            dictionary_page_offset,
1218            statistics,
1219            encoding_stats,
1220            bloom_filter_offset,
1221            bloom_filter_length,
1222            offset_index_offset,
1223            offset_index_length,
1224            column_index_offset,
1225            column_index_length,
1226            unencoded_byte_array_data_bytes,
1227            repetition_level_histogram,
1228            definition_level_histogram,
1229            #[cfg(feature = "encryption")]
1230            column_crypto_metadata,
1231        };
1232        Ok(result)
1233    }
1234
1235    /// Method to convert to Thrift.
1236    pub fn to_thrift(&self) -> ColumnChunk {
1237        let column_metadata = self.to_column_metadata_thrift();
1238
1239        ColumnChunk {
1240            file_path: self.file_path().map(|s| s.to_owned()),
1241            file_offset: self.file_offset,
1242            meta_data: Some(column_metadata),
1243            offset_index_offset: self.offset_index_offset,
1244            offset_index_length: self.offset_index_length,
1245            column_index_offset: self.column_index_offset,
1246            column_index_length: self.column_index_length,
1247            crypto_metadata: self.column_crypto_metadata_thrift(),
1248            encrypted_column_metadata: None,
1249        }
1250    }
1251
1252    /// Method to convert to Thrift `ColumnMetaData`
1253    pub fn to_column_metadata_thrift(&self) -> ColumnMetaData {
1254        let size_statistics = if self.unencoded_byte_array_data_bytes.is_some()
1255            || self.repetition_level_histogram.is_some()
1256            || self.definition_level_histogram.is_some()
1257        {
1258            let repetition_level_histogram = self
1259                .repetition_level_histogram
1260                .as_ref()
1261                .map(|hist| hist.clone().into_inner());
1262
1263            let definition_level_histogram = self
1264                .definition_level_histogram
1265                .as_ref()
1266                .map(|hist| hist.clone().into_inner());
1267
1268            Some(SizeStatistics {
1269                unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes,
1270                repetition_level_histogram,
1271                definition_level_histogram,
1272            })
1273        } else {
1274            None
1275        };
1276
1277        ColumnMetaData {
1278            type_: self.column_type().into(),
1279            encodings: self.encodings().iter().map(|&v| v.into()).collect(),
1280            path_in_schema: self.column_path().as_ref().to_vec(),
1281            codec: self.compression.into(),
1282            num_values: self.num_values,
1283            total_uncompressed_size: self.total_uncompressed_size,
1284            total_compressed_size: self.total_compressed_size,
1285            key_value_metadata: None,
1286            data_page_offset: self.data_page_offset,
1287            index_page_offset: self.index_page_offset,
1288            dictionary_page_offset: self.dictionary_page_offset,
1289            statistics: statistics::to_thrift(self.statistics.as_ref()),
1290            encoding_stats: self
1291                .encoding_stats
1292                .as_ref()
1293                .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()),
1294            bloom_filter_offset: self.bloom_filter_offset,
1295            bloom_filter_length: self.bloom_filter_length,
1296            size_statistics,
1297            geospatial_statistics: None,
1298        }
1299    }
1300
1301    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
1302    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
1303        ColumnChunkMetaDataBuilder::from(self)
1304    }
1305
1306    #[cfg(feature = "encryption")]
1307    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
1308        self.column_crypto_metadata
1309            .as_ref()
1310            .map(column_crypto_metadata::to_thrift)
1311    }
1312
1313    #[cfg(not(feature = "encryption"))]
1314    fn column_crypto_metadata_thrift(&self) -> Option<TColumnCryptoMetaData> {
1315        None
1316    }
1317}
1318
/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one.
///
/// A builder is obtained from [`ColumnChunkMetaData::builder`] (fresh
/// metadata for a column descriptor), or from existing metadata through
/// [`ColumnChunkMetaData::into_builder`] or `From<ColumnChunkMetaData>`.
/// Every setter consumes the builder and returns it, so calls can be chained.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
// The wrapped value is the metadata under construction.
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1338
1339impl ColumnChunkMetaDataBuilder {
1340    /// Creates new column chunk metadata builder.
1341    ///
1342    /// See also [`ColumnChunkMetaData::builder`]
1343    fn new(column_descr: ColumnDescPtr) -> Self {
1344        Self(ColumnChunkMetaData {
1345            column_descr,
1346            encodings: Vec::new(),
1347            file_path: None,
1348            file_offset: 0,
1349            num_values: 0,
1350            compression: Compression::UNCOMPRESSED,
1351            total_compressed_size: 0,
1352            total_uncompressed_size: 0,
1353            data_page_offset: 0,
1354            index_page_offset: None,
1355            dictionary_page_offset: None,
1356            statistics: None,
1357            encoding_stats: None,
1358            bloom_filter_offset: None,
1359            bloom_filter_length: None,
1360            offset_index_offset: None,
1361            offset_index_length: None,
1362            column_index_offset: None,
1363            column_index_length: None,
1364            unencoded_byte_array_data_bytes: None,
1365            repetition_level_histogram: None,
1366            definition_level_histogram: None,
1367            #[cfg(feature = "encryption")]
1368            column_crypto_metadata: None,
1369        })
1370    }
1371
1372    /// Sets list of encodings for this column chunk.
1373    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
1374        self.0.encodings = encodings;
1375        self
1376    }
1377
1378    /// Sets optional file path for this column chunk.
1379    pub fn set_file_path(mut self, value: String) -> Self {
1380        self.0.file_path = Some(value);
1381        self
1382    }
1383
1384    /// Sets number of values.
1385    pub fn set_num_values(mut self, value: i64) -> Self {
1386        self.0.num_values = value;
1387        self
1388    }
1389
1390    /// Sets compression.
1391    pub fn set_compression(mut self, value: Compression) -> Self {
1392        self.0.compression = value;
1393        self
1394    }
1395
1396    /// Sets total compressed size in bytes.
1397    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
1398        self.0.total_compressed_size = value;
1399        self
1400    }
1401
1402    /// Sets total uncompressed size in bytes.
1403    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
1404        self.0.total_uncompressed_size = value;
1405        self
1406    }
1407
1408    /// Sets data page offset in bytes.
1409    pub fn set_data_page_offset(mut self, value: i64) -> Self {
1410        self.0.data_page_offset = value;
1411        self
1412    }
1413
1414    /// Sets optional dictionary page offset in bytes.
1415    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
1416        self.0.dictionary_page_offset = value;
1417        self
1418    }
1419
1420    /// Sets optional index page offset in bytes.
1421    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
1422        self.0.index_page_offset = value;
1423        self
1424    }
1425
1426    /// Sets statistics for this column chunk.
1427    pub fn set_statistics(mut self, value: Statistics) -> Self {
1428        self.0.statistics = Some(value);
1429        self
1430    }
1431
1432    /// Clears the statistics for this column chunk.
1433    pub fn clear_statistics(mut self) -> Self {
1434        self.0.statistics = None;
1435        self
1436    }
1437
1438    /// Sets page encoding stats for this column chunk.
1439    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
1440        self.0.encoding_stats = Some(value);
1441        self
1442    }
1443
1444    /// Clears the page encoding stats for this column chunk.
1445    pub fn clear_page_encoding_stats(mut self) -> Self {
1446        self.0.encoding_stats = None;
1447        self
1448    }
1449
1450    /// Sets optional bloom filter offset in bytes.
1451    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
1452        self.0.bloom_filter_offset = value;
1453        self
1454    }
1455
1456    /// Sets optional bloom filter length in bytes.
1457    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
1458        self.0.bloom_filter_length = value;
1459        self
1460    }
1461
1462    /// Sets optional offset index offset in bytes.
1463    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
1464        self.0.offset_index_offset = value;
1465        self
1466    }
1467
1468    /// Sets optional offset index length in bytes.
1469    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
1470        self.0.offset_index_length = value;
1471        self
1472    }
1473
1474    /// Sets optional column index offset in bytes.
1475    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
1476        self.0.column_index_offset = value;
1477        self
1478    }
1479
1480    /// Sets optional column index length in bytes.
1481    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
1482        self.0.column_index_length = value;
1483        self
1484    }
1485
1486    /// Sets optional length of variable length data in bytes.
1487    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
1488        self.0.unencoded_byte_array_data_bytes = value;
1489        self
1490    }
1491
1492    /// Sets optional repetition level histogram
1493    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1494        self.0.repetition_level_histogram = value;
1495        self
1496    }
1497
1498    /// Sets optional repetition level histogram
1499    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1500        self.0.definition_level_histogram = value;
1501        self
1502    }
1503
1504    #[cfg(feature = "encryption")]
1505    /// Set the encryption metadata for an encrypted column
1506    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
1507        self.0.column_crypto_metadata = value;
1508        self
1509    }
1510
1511    /// Builds column chunk metadata.
1512    pub fn build(self) -> Result<ColumnChunkMetaData> {
1513        Ok(self.0)
1514    }
1515}
1516
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// Per-page entries are collected with `append` / `append_histograms` and
/// converted into the Thrift structure by `build_to_thrift`.
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
    /// `null_page` flag of every appended page, in append order
    null_pages: Vec<bool>,
    /// `min_value` bytes of every appended page, in append order
    min_values: Vec<Vec<u8>>,
    /// `max_value` bytes of every appended page, in append order
    max_values: Vec<Vec<u8>>,
    /// `null_count` of every appended page, in append order
    null_counts: Vec<i64>,
    /// boundary order passed to the column index; starts as `UNORDERED`
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any entry in the page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-nan values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}
1539
1540impl Default for ColumnIndexBuilder {
1541    fn default() -> Self {
1542        Self::new()
1543    }
1544}
1545
1546impl ColumnIndexBuilder {
1547    /// Creates a new column index builder.
1548    pub fn new() -> Self {
1549        ColumnIndexBuilder {
1550            null_pages: Vec::new(),
1551            min_values: Vec::new(),
1552            max_values: Vec::new(),
1553            null_counts: Vec::new(),
1554            boundary_order: BoundaryOrder::UNORDERED,
1555            repetition_level_histograms: None,
1556            definition_level_histograms: None,
1557            valid: true,
1558        }
1559    }
1560
1561    /// Append statistics for the next page
1562    pub fn append(
1563        &mut self,
1564        null_page: bool,
1565        min_value: Vec<u8>,
1566        max_value: Vec<u8>,
1567        null_count: i64,
1568    ) {
1569        self.null_pages.push(null_page);
1570        self.min_values.push(min_value);
1571        self.max_values.push(max_value);
1572        self.null_counts.push(null_count);
1573    }
1574
1575    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
1576    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
1577    pub fn append_histograms(
1578        &mut self,
1579        repetition_level_histogram: &Option<LevelHistogram>,
1580        definition_level_histogram: &Option<LevelHistogram>,
1581    ) {
1582        if !self.valid {
1583            return;
1584        }
1585        if let Some(ref rep_lvl_hist) = repetition_level_histogram {
1586            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1587            hist.reserve(rep_lvl_hist.len());
1588            hist.extend(rep_lvl_hist.values());
1589        }
1590        if let Some(ref def_lvl_hist) = definition_level_histogram {
1591            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1592            hist.reserve(def_lvl_hist.len());
1593            hist.extend(def_lvl_hist.values());
1594        }
1595    }
1596
1597    /// Set the boundary order of the column index
1598    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1599        self.boundary_order = boundary_order;
1600    }
1601
1602    /// Mark this column index as invalid
1603    pub fn to_invalid(&mut self) {
1604        self.valid = false;
1605    }
1606
1607    /// Is the information in the builder valid?
1608    pub fn valid(&self) -> bool {
1609        self.valid
1610    }
1611
1612    /// Build and get the thrift metadata of column index
1613    ///
1614    /// Note: callers should check [`Self::valid`] before calling this method
1615    pub fn build_to_thrift(self) -> ColumnIndex {
1616        ColumnIndex::new(
1617            self.null_pages,
1618            self.min_values,
1619            self.max_values,
1620            self.boundary_order,
1621            self.null_counts,
1622            self.repetition_level_histograms,
1623            self.definition_level_histograms,
1624        )
1625    }
1626}
1627
1628impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1629    fn from(value: ColumnChunkMetaData) -> Self {
1630        ColumnChunkMetaDataBuilder(value)
1631    }
1632}
1633
/// Builder for offset index, part of the Parquet [PageIndex].
///
/// Page information is collected with the `append_*` methods and converted
/// into the Thrift structure by `build_to_thrift`.
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
    /// offset of every appended page, in append order
    offset_array: Vec<i64>,
    /// compressed size of every appended page, in append order
    compressed_page_size_array: Vec<i32>,
    /// index of the first row of every page, derived from the appended row counts
    first_row_index_array: Vec<i64>,
    /// appended unencoded byte array data bytes; `None` until the first
    /// `Some` value has been appended
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// sum of all row counts appended so far, i.e. the first row index of
    /// the next page
    current_first_row_index: i64,
}
1644
1645impl Default for OffsetIndexBuilder {
1646    fn default() -> Self {
1647        Self::new()
1648    }
1649}
1650
1651impl OffsetIndexBuilder {
1652    /// Creates a new offset index builder.
1653    pub fn new() -> Self {
1654        OffsetIndexBuilder {
1655            offset_array: Vec::new(),
1656            compressed_page_size_array: Vec::new(),
1657            first_row_index_array: Vec::new(),
1658            unencoded_byte_array_data_bytes_array: None,
1659            current_first_row_index: 0,
1660        }
1661    }
1662
1663    /// Append the row count of the next page.
1664    pub fn append_row_count(&mut self, row_count: i64) {
1665        let current_page_row_index = self.current_first_row_index;
1666        self.first_row_index_array.push(current_page_row_index);
1667        self.current_first_row_index += row_count;
1668    }
1669
1670    /// Append the offset and size of the next page.
1671    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1672        self.offset_array.push(offset);
1673        self.compressed_page_size_array.push(compressed_page_size);
1674    }
1675
1676    /// Append the unencoded byte array data bytes of the next page.
1677    pub fn append_unencoded_byte_array_data_bytes(
1678        &mut self,
1679        unencoded_byte_array_data_bytes: Option<i64>,
1680    ) {
1681        if let Some(val) = unencoded_byte_array_data_bytes {
1682            self.unencoded_byte_array_data_bytes_array
1683                .get_or_insert(Vec::new())
1684                .push(val);
1685        }
1686    }
1687
1688    /// Build and get the thrift metadata of offset index
1689    pub fn build_to_thrift(self) -> OffsetIndex {
1690        let locations = self
1691            .offset_array
1692            .iter()
1693            .zip(self.compressed_page_size_array.iter())
1694            .zip(self.first_row_index_array.iter())
1695            .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index))
1696            .collect::<Vec<_>>();
1697        OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array)
1698    }
1699}
1700
#[cfg(test)]
mod tests {
    use super::*;
    use crate::basic::{PageType, SortOrder};
    use crate::file::page_index::index::NativeIndex;

    /// A row group built from default column chunks must be unchanged after a
    /// round trip through its thrift representation.
    #[test]
    fn test_row_group_metadata_thrift_conversion() {
        let schema_descr = get_test_schema_descr();

        let columns: Vec<_> = schema_descr
            .columns()
            .iter()
            .map(|ptr| ColumnChunkMetaData::builder(ptr.clone()).build().unwrap())
            .collect();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(columns)
            .set_ordinal(1)
            .build()
            .unwrap();

        let row_group_exp = row_group_meta.to_thrift();
        let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(row_group_res, row_group_exp);
    }

    /// Building a row group without any column metadata for a two column
    /// schema must fail with a column length mismatch error.
    #[test]
    fn test_row_group_metadata_thrift_conversion_empty() {
        let schema_descr = get_test_schema_descr();

        let err = RowGroupMetaData::builder(schema_descr)
            .build()
            .unwrap_err()
            .to_string();

        assert_eq!(err, "Parquet error: Column length mismatch: 2 != 0");
    }

    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
    #[test]
    fn test_row_group_metadata_thrift_corrupted() {
        let schema_descr_2cols = get_test_schema_descr();
        let schema_descr_3cols = schema_descr_from_fields(vec![
            int32_field("a"),
            int32_field("b"),
            int32_field("c"),
        ]);

        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
            .set_num_rows(1000)
            .set_total_byte_size(2000)
            .set_column_metadata(vec![
                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
                    .build()
                    .unwrap(),
                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
                    .build()
                    .unwrap(),
            ])
            .set_ordinal(1)
            .build()
            .unwrap();

        // Decode the two column row group against the three column schema.
        let err =
            RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift())
                .unwrap_err()
                .to_string();
        assert_eq!(
            err,
            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
        );
    }

    /// A column chunk with every optional field populated must compare equal
    /// after a round trip through its thrift representation.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
            .set_file_path("file_path".to_owned())
            .set_num_values(1000)
            .set_compression(Compression::SNAPPY)
            .set_total_compressed_size(2000)
            .set_total_uncompressed_size(3000)
            .set_data_page_offset(4000)
            .set_dictionary_page_offset(Some(5000))
            .set_page_encoding_stats(vec![
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::PLAIN,
                    count: 3,
                },
                PageEncodingStats {
                    page_type: PageType::DATA_PAGE,
                    encoding: Encoding::RLE,
                    count: 5,
                },
            ])
            .set_bloom_filter_offset(Some(6000))
            .set_bloom_filter_length(Some(25))
            .set_offset_index_offset(Some(7000))
            .set_offset_index_length(Some(25))
            .set_column_index_offset(Some(8000))
            .set_column_index_length(Some(25))
            .set_unencoded_byte_array_data_bytes(Some(2000))
            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
            .build()
            .unwrap();

        let col_chunk_res =
            ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()).unwrap();

        assert_eq!(col_chunk_res, col_metadata);
    }

    /// A column chunk built with only default values must produce the same
    /// thrift representation after a round trip.
    #[test]
    fn test_column_chunk_metadata_thrift_conversion_empty() {
        let column_descr = get_test_schema_descr().column(0);

        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
            .build()
            .unwrap();

        let col_chunk_exp = col_metadata.to_thrift();
        let col_chunk_res = ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone())
            .unwrap()
            .to_thrift();

        assert_eq!(col_chunk_res, col_chunk_exp);
    }

    /// The compressed size of a row group is checked against two columns of
    /// 500 compressed bytes each.
    #[test]
    fn test_compressed_size() {
        let schema_descr = get_test_schema_descr();

        let columns: Vec<_> = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_total_compressed_size(500)
                    .set_total_uncompressed_size(700)
                    .build()
                    .unwrap()
            })
            .collect();
        let row_group_meta = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();

        let compressed_size_res: i64 = row_group_meta.compressed_size();
        let compressed_size_exp: i64 = 1000;

        assert_eq!(compressed_size_res, compressed_size_exp);
    }

    /// Pins the value reported by `ParquetMetaData::memory_size` for two
    /// metadata instances: one with row groups only, and one that additionally
    /// carries a column index and an offset index.
    ///
    /// The expected sizes are hard-coded and differ depending on whether the
    /// `encryption` feature is enabled.
    #[test]
    fn test_memory_size() {
        let schema_descr = get_test_schema_descr();

        // Row group whose column statistics carry no min/max values; it is
        // used for the second measurement further below.
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Now, add in Exact Statistics
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();

        #[cfg(not(feature = "encryption"))]
        let base_expected_size = 2312;
        #[cfg(feature = "encryption")]
        let base_expected_size = 2648;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Build a single page column index to attach to the metadata.
        let mut column_index = ColumnIndexBuilder::new();
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build_to_thrift();
        let native_index = NativeIndex::<bool>::try_new(column_index).unwrap();

        // Now, add in OffsetIndex
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build_to_thrift();

        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![
                OffsetIndexMetaData::try_new(offset_index).unwrap()
            ]]))
            .build();

        #[cfg(not(feature = "encryption"))]
        let bigger_expected_size = 2816;
        #[cfg(feature = "encryption")]
        let bigger_expected_size = 3152;

        // more set fields means more memory usage
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }

    /// Returns sample schema descriptor so we can create column metadata.
    ///
    /// The schema is a group named `schema` with two INT32 columns, `a` and `b`.
    fn get_test_schema_descr() -> SchemaDescPtr {
        schema_descr_from_fields(vec![int32_field("a"), int32_field("b")])
    }

    /// Builds a primitive INT32 schema field with the given name.
    fn int32_field(name: &str) -> Arc<SchemaType> {
        Arc::new(
            SchemaType::primitive_type_builder(name, Type::INT32)
                .build()
                .unwrap(),
        )
    }

    /// Wraps the given fields in a group named `schema` and returns the
    /// corresponding schema descriptor.
    fn schema_descr_from_fields(fields: Vec<Arc<SchemaType>>) -> SchemaDescPtr {
        let schema = SchemaType::group_type_builder("schema")
            .with_fields(fields)
            .build()
            .unwrap();

        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
    }
}
parquet/file/metadata/mod.rs

parquet/file/metadata/
mod.rs