parquet/file/metadata/
options.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Options used to control metadata parsing
19
20use std::collections::HashSet;
21use std::sync::Arc;
22
23use crate::schema::types::SchemaDescPtr;
24
/// Policy selecting the columns for which a class of Parquet statistics is decoded.
///
/// # Example
/// ```rust
/// use parquet::file::metadata::ParquetStatisticsPolicy;
/// use parquet::file::serialized_reader::ReadOptionsBuilder;
/// use parquet::arrow::arrow_reader::ArrowReaderOptions;
///
/// // Arrow reader: skip the encoding statistics of every column.
/// let options =
///     ArrowReaderOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll);
///
/// // Serialized reader: decode the encoding statistics of every column.
/// let options =
///     ReadOptionsBuilder::new().with_encoding_stats_policy(ParquetStatisticsPolicy::KeepAll)
///     .build();
///
/// // Arrow reader: skip the encoding statistics of every column except columns 0 and 1,
/// // whose statistics are still decoded.
/// let options = ArrowReaderOptions::new()
///     .with_encoding_stats_policy(ParquetStatisticsPolicy::skip_except(&[0, 1]));
/// ```
#[derive(Default, Debug, Clone)]
pub enum ParquetStatisticsPolicy {
    /// Every column has the relevant statistics decoded. This is the default policy.
    #[default]
    KeepAll,
    /// No column has the relevant statistics decoded.
    SkipAll,
    /// Only the columns whose indices are in the set have the relevant statistics
    /// decoded; all other columns are skipped.
    SkipExcept(Arc<HashSet<usize>>),
}

impl ParquetStatisticsPolicy {
    /// Build a policy that skips every column whose index is not listed in `keep`.
    ///
    /// An empty `keep` yields [`Self::SkipAll`]. Repeated indices are stored once, since
    /// the indices are held in a [`HashSet`].
    pub fn skip_except(keep: &[usize]) -> Self {
        // Nothing to keep: use the dedicated variant instead of wrapping an empty set.
        if keep.is_empty() {
            return Self::SkipAll;
        }

        let kept_columns: HashSet<usize> = keep.iter().copied().collect();
        Self::SkipExcept(Arc::new(kept_columns))
    }

    /// Returns `true` when this policy says the statistics of column `col_index` should
    /// be skipped, and `false` when they should be decoded.
    pub(crate) fn is_skip(&self, col_index: usize) -> bool {
        let keep_stats = match self {
            Self::KeepAll => true,
            Self::SkipAll => false,
            Self::SkipExcept(kept_columns) => kept_columns.contains(&col_index),
        };
        !keep_stats
    }
}
82
83/// Options that can be set to control what parts of the Parquet file footer
84/// metadata will be decoded and made present in the [`ParquetMetaData`] returned
85/// by [`ParquetMetaDataReader`] and [`ParquetMetaDataPushDecoder`].
86///
87/// [`ParquetMetaData`]: crate::file::metadata::ParquetMetaData
88/// [`ParquetMetaDataReader`]: crate::file::metadata::ParquetMetaDataReader
89/// [`ParquetMetaDataPushDecoder`]: crate::file::metadata::ParquetMetaDataPushDecoder
90#[derive(Default, Debug, Clone)]
91pub struct ParquetMetaDataOptions {
92    schema_descr: Option<SchemaDescPtr>,
93    encoding_stats_as_mask: bool,
94    encoding_stats_policy: ParquetStatisticsPolicy,
95}
96
97impl ParquetMetaDataOptions {
98    /// Return a new default [`ParquetMetaDataOptions`].
99    pub fn new() -> Self {
100        Default::default()
101    }
102
103    /// Returns an optional [`SchemaDescPtr`] to use when decoding. If this is not `None` then
104    /// the schema in the footer will be skipped.
105    pub fn schema(&self) -> Option<&SchemaDescPtr> {
106        self.schema_descr.as_ref()
107    }
108
109    /// Provide a schema to use when decoding the metadata.
110    pub fn set_schema(&mut self, val: SchemaDescPtr) {
111        self.schema_descr = Some(val);
112    }
113
114    /// Call [`Self::set_schema`] and return `Self` for chaining.
115    pub fn with_schema(mut self, val: SchemaDescPtr) -> Self {
116        self.set_schema(val);
117        self
118    }
119
120    /// Returns whether to present the [`encoding_stats`] field of the Parquet `ColumnMetaData`
121    /// as a bitmask (defaults to `false`).
122    ///
123    /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this
124    /// might be desirable.
125    ///
126    /// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
127    /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
128    /// [`encoding_stats`]:
129    /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
130    pub fn encoding_stats_as_mask(&self) -> bool {
131        self.encoding_stats_as_mask
132    }
133
134    /// Convert [`encoding_stats`] from a vector of [`PageEncodingStats`] to a bitmask. This can
135    /// speed up metadata decoding while still enabling some use cases served by the full stats.
136    ///
137    /// Note that if for a given column both this option and `skip_encoding_stats` are `true`, the
138    /// stats will be skipped and not be returned as a mask.
139    ///
140    /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for more information.
141    ///
142    /// [`PageEncodingStats`]: crate::file::metadata::PageEncodingStats
143    /// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
144    /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
145    /// [`encoding_stats`]:
146    /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
147    pub fn set_encoding_stats_as_mask(&mut self, val: bool) {
148        self.encoding_stats_as_mask = val;
149    }
150
151    /// Call [`Self::set_encoding_stats_as_mask`] and return `Self` for chaining.
152    pub fn with_encoding_stats_as_mask(mut self, val: bool) -> Self {
153        self.set_encoding_stats_as_mask(val);
154        self
155    }
156
157    /// Returns whether to skip decoding the [`encoding_stats`] in the Parquet `ColumnMetaData`
158    /// for the column indexed by `col_index`.
159    ///
160    /// [`encoding_stats`]:
161    /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
162    pub fn skip_encoding_stats(&self, col_index: usize) -> bool {
163        self.encoding_stats_policy.is_skip(col_index)
164    }
165
166    /// Sets the decoding policy for [`encoding_stats`] in the Parquet `ColumnMetaData`.
167    ///
168    /// The default policy is to decode all `encoding_stats`.
169    ///
170    /// This option takes precedence over [`Self::encoding_stats_as_mask`].
171    ///
172    /// [`encoding_stats`]:
173    /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
174    pub fn set_encoding_stats_policy(&mut self, policy: ParquetStatisticsPolicy) {
175        self.encoding_stats_policy = policy;
176    }
177
178    /// Call [`Self::set_encoding_stats_policy`] and return `Self` for chaining.
179    pub fn with_encoding_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self {
180        self.set_encoding_stats_policy(policy);
181        self
182    }
183}
184
#[cfg(test)]
mod tests {
    use bytes::Bytes;

    use crate::{
        DecodeResult,
        file::metadata::{ParquetMetaDataOptions, ParquetMetaDataPushDecoder},
        util::test_common::file_util::get_test_file,
    };
    use std::{io::Read, sync::Arc};

    /// Decodes the metadata of a test file twice: first without options, then with the
    /// schema from the first pass supplied through [`ParquetMetaDataOptions`]. Both
    /// results must be equal and must share the very same schema allocation.
    #[test]
    fn test_provide_schema() {
        // Read the entire test file into memory so the full byte range can be pushed at once.
        let mut raw: Vec<u8> = Vec::new();
        let mut file = get_test_file("alltypes_plain.parquet");
        file.read_to_end(&mut raw).unwrap();

        let data = Bytes::from(raw);
        let file_len = data.len() as u64;

        // First pass: decode without any options to obtain the reference metadata.
        let mut baseline_decoder = ParquetMetaDataPushDecoder::try_new(file_len).unwrap();
        baseline_decoder
            .push_range(0..file_len, data.clone())
            .unwrap();
        let DecodeResult::Data(expected) = baseline_decoder.try_decode().unwrap() else {
            panic!("could not parse metadata");
        };

        // Second pass: hand the already decoded schema to the decoder via the options.
        let expected_schema = expected.file_metadata().schema_descr_ptr();
        let options = Arc::new(ParquetMetaDataOptions::new().with_schema(expected_schema));

        let mut schema_decoder = ParquetMetaDataPushDecoder::try_new(file_len)
            .unwrap()
            .with_metadata_options(Some(options));
        schema_decoder.push_range(0..file_len, data).unwrap();
        let DecodeResult::Data(metadata) = schema_decoder.try_decode().unwrap() else {
            panic!("could not parse metadata");
        };

        assert_eq!(expected, metadata);
        // the provided schema must be reused as-is, i.e. both point at the same allocation
        assert!(Arc::ptr_eq(
            &expected.file_metadata().schema_descr_ptr(),
            &metadata.file_metadata().schema_descr_ptr()
        ));
    }
}
235}