parquet/file/metadata/
options.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Options used to control metadata parsing
19
20use std::collections::HashSet;
21use std::sync::Arc;
22
23use crate::schema::types::SchemaDescPtr;
24
/// Per-column policy selecting whether a family of Parquet statistics fields is decoded.
///
/// # Example
/// ```rust
/// use parquet::file::metadata::ParquetStatisticsPolicy;
/// use parquet::file::serialized_reader::ReadOptionsBuilder;
/// use parquet::arrow::arrow_reader::ArrowReaderOptions;
///
/// // Arrow reader: do not decode encoding statistics for any column.
/// let options =
///     ArrowReaderOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll);
///
/// // Serialized reader: decode encoding statistics for every column.
/// let options =
///     ReadOptionsBuilder::new().with_encoding_stats_policy(ParquetStatisticsPolicy::KeepAll)
///     .build();
///
/// // Arrow reader: decode encoding statistics for columns 0 and 1 only, skipping
/// // them for every other column.
/// let options = ArrowReaderOptions::new()
///     .with_encoding_stats_policy(ParquetStatisticsPolicy::skip_except(&[0, 1]));
/// ```
#[derive(Default, Debug, Clone)]
pub enum ParquetStatisticsPolicy {
    /// Every column has the relevant statistics decoded. This is the default variant.
    #[default]
    KeepAll,
    /// No column has the relevant statistics decoded.
    SkipAll,
    /// Only the columns whose indices appear in the set have the relevant statistics
    /// decoded; all other columns are skipped.
    SkipExcept(Arc<HashSet<usize>>),
}

impl ParquetStatisticsPolicy {
    /// Builds a policy that skips every column other than the indices listed in `keep`.
    ///
    /// An empty `keep` slice yields [`Self::SkipAll`]. Repeated indices are collapsed,
    /// since the indices are stored in a set.
    pub fn skip_except(keep: &[usize]) -> Self {
        // Nothing to retain: use the cheaper unit variant instead of an empty set.
        if keep.is_empty() {
            return Self::SkipAll;
        }

        let retained: HashSet<usize> = keep.iter().copied().collect();
        Self::SkipExcept(Arc::new(retained))
    }

    /// Reports whether this policy skips the statistics of the column at `col_index`.
    pub(crate) fn is_skip(&self, col_index: usize) -> bool {
        // Work out whether the column is decoded, then invert to answer "is it skipped?".
        let decoded = match self {
            Self::KeepAll => true,
            Self::SkipAll => false,
            Self::SkipExcept(retained) => retained.contains(&col_index),
        };
        !decoded
    }
}
82
83/// Options that can be set to control what parts of the Parquet file footer
84/// metadata will be decoded and made present in the [`ParquetMetaData`] returned
85/// by [`ParquetMetaDataReader`] and [`ParquetMetaDataPushDecoder`].
86///
87/// [`ParquetMetaData`]: crate::file::metadata::ParquetMetaData
88/// [`ParquetMetaDataReader`]: crate::file::metadata::ParquetMetaDataReader
89/// [`ParquetMetaDataPushDecoder`]: crate::file::metadata::ParquetMetaDataPushDecoder
90#[derive(Debug, Clone)]
91pub struct ParquetMetaDataOptions {
92    schema_descr: Option<SchemaDescPtr>,
93    encoding_stats_as_mask: bool,
94    encoding_stats_policy: ParquetStatisticsPolicy,
95    column_stats_policy: ParquetStatisticsPolicy,
96    size_stats_policy: ParquetStatisticsPolicy,
97}
98
99impl Default for ParquetMetaDataOptions {
100    fn default() -> Self {
101        Self {
102            schema_descr: None,
103            encoding_stats_as_mask: true,
104            encoding_stats_policy: ParquetStatisticsPolicy::KeepAll,
105            column_stats_policy: ParquetStatisticsPolicy::KeepAll,
106            size_stats_policy: ParquetStatisticsPolicy::KeepAll,
107        }
108    }
109}
110
111impl ParquetMetaDataOptions {
112    /// Return a new default [`ParquetMetaDataOptions`].
113    pub fn new() -> Self {
114        Default::default()
115    }
116
117    /// Returns an optional [`SchemaDescPtr`] to use when decoding. If this is not `None` then
118    /// the schema in the footer will be skipped.
119    pub fn schema(&self) -> Option<&SchemaDescPtr> {
120        self.schema_descr.as_ref()
121    }
122
123    /// Provide a schema to use when decoding the metadata.
124    pub fn set_schema(&mut self, val: SchemaDescPtr) {
125        self.schema_descr = Some(val);
126    }
127
128    /// Call [`Self::set_schema`] and return `Self` for chaining.
129    pub fn with_schema(mut self, val: SchemaDescPtr) -> Self {
130        self.set_schema(val);
131        self
132    }
133
134    /// Returns whether to present the [`encoding_stats`] field of the Parquet `ColumnMetaData`
135    /// as a bitmask (defaults to `true`).
136    ///
137    /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this
138    /// might be desirable.
139    ///
140    /// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
141    /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
142    /// [`encoding_stats`]:
143    /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
144    pub fn encoding_stats_as_mask(&self) -> bool {
145        self.encoding_stats_as_mask
146    }
147
148    /// Convert [`encoding_stats`] from a vector of [`PageEncodingStats`] to a bitmask. This can
149    /// speed up metadata decoding while still enabling some use cases served by the full stats.
150    ///
151    /// Note that if for a given column both this option and `skip_encoding_stats` are `true`, the
152    /// stats will be skipped and not be returned as a mask.
153    ///
154    /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for more information.
155    ///
156    /// [`PageEncodingStats`]: crate::file::metadata::PageEncodingStats
157    /// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
158    /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
159    /// [`encoding_stats`]:
160    /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
161    pub fn set_encoding_stats_as_mask(&mut self, val: bool) {
162        self.encoding_stats_as_mask = val;
163    }
164
165    /// Call [`Self::set_encoding_stats_as_mask`] and return `Self` for chaining.
166    pub fn with_encoding_stats_as_mask(mut self, val: bool) -> Self {
167        self.set_encoding_stats_as_mask(val);
168        self
169    }
170
171    /// Returns whether to skip decoding the [`encoding_stats`] in the Parquet `ColumnMetaData`
172    /// for the column indexed by `col_index`.
173    ///
174    /// [`encoding_stats`]:
175    /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
176    pub fn skip_encoding_stats(&self, col_index: usize) -> bool {
177        self.encoding_stats_policy.is_skip(col_index)
178    }
179
180    /// Sets the decoding policy for [`encoding_stats`] in the Parquet `ColumnMetaData`.
181    ///
182    /// The default policy is to decode all `encoding_stats`.
183    ///
184    /// This option takes precedence over [`Self::encoding_stats_as_mask`].
185    ///
186    /// [`encoding_stats`]:
187    /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
188    pub fn set_encoding_stats_policy(&mut self, policy: ParquetStatisticsPolicy) {
189        self.encoding_stats_policy = policy;
190    }
191
192    /// Call [`Self::set_encoding_stats_policy`] and return `Self` for chaining.
193    pub fn with_encoding_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self {
194        self.set_encoding_stats_policy(policy);
195        self
196    }
197
198    /// Returns whether to skip decoding the [`statistics`] in the Parquet `ColumnMetaData`
199    /// for the column indexed by `col_index`.
200    ///
201    /// [`statistics`]:
202    /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L912
203    pub fn skip_column_stats(&self, col_index: usize) -> bool {
204        self.column_stats_policy.is_skip(col_index)
205    }
206
207    /// Sets the decoding policy for [`statistics`] in the Parquet `ColumnMetaData`.
208    ///
209    /// The default policy is to decode all `statistics`.
210    ///
211    /// [`statistics`]:
212    /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L912
213    pub fn set_column_stats_policy(&mut self, policy: ParquetStatisticsPolicy) {
214        self.column_stats_policy = policy;
215    }
216
217    /// Call [`Self::set_column_stats_policy`] and return `Self` for chaining.
218    pub fn with_column_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self {
219        self.set_column_stats_policy(policy);
220        self
221    }
222
223    /// Returns whether to skip decoding the [`size_statistics`] in the Parquet `ColumnMetaData`
224    /// for the column indexed by `col_index`.
225    ///
226    /// [`size_statistics`]:
227    /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L936
228    pub fn skip_size_stats(&self, col_index: usize) -> bool {
229        self.size_stats_policy.is_skip(col_index)
230    }
231
232    /// Sets the decoding policy for [`size_statistics`] in the Parquet `ColumnMetaData`.
233    ///
234    /// The default policy is to decode all `size_statistics`.
235    ///
236    /// [`size_statistics`]:
237    /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L936
238    pub fn set_size_stats_policy(&mut self, policy: ParquetStatisticsPolicy) {
239        self.size_stats_policy = policy;
240    }
241
242    /// Call [`Self::set_size_stats_policy`] and return `Self` for chaining.
243    pub fn with_size_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self {
244        self.set_size_stats_policy(policy);
245        self
246    }
247}
248
#[cfg(test)]
mod tests {
    use bytes::Bytes;

    use crate::{
        DecodeResult,
        file::metadata::{ParquetMetaDataOptions, ParquetMetaDataPushDecoder},
        util::test_common::file_util::get_test_file,
    };
    use std::{io::Read, sync::Arc};

    /// Default options present `encoding_stats` as a bitmask.
    #[test]
    fn test_options_default() {
        let defaults = ParquetMetaDataOptions::default();
        assert!(defaults.encoding_stats_as_mask());
    }

    /// Decoding with a caller-provided schema must give metadata equal to a plain decode
    /// and must reuse the provided schema pointer.
    #[test]
    fn test_provide_schema() {
        // Load the whole test file into memory so it can be pushed to the decoder at once.
        let mut raw: Vec<u8> = Vec::new();
        get_test_file("alltypes_plain.parquet")
            .read_to_end(&mut raw)
            .unwrap();
        let data = Bytes::from(raw);
        let file_len = data.len() as u64;

        // First pass: decode without options to obtain the reference metadata.
        let mut plain_decoder = ParquetMetaDataPushDecoder::try_new(file_len).unwrap();
        plain_decoder.push_range(0..file_len, data.clone()).unwrap();
        let DecodeResult::Data(expected) = plain_decoder.try_decode().unwrap() else {
            panic!("could not parse metadata")
        };
        let expected_schema = expected.file_metadata().schema_descr_ptr();

        // Second pass: decode again, this time handing the reference schema to the decoder.
        let mut options = ParquetMetaDataOptions::new();
        options.set_schema(expected_schema);
        let options = Arc::new(options);

        let mut schema_decoder = ParquetMetaDataPushDecoder::try_new(file_len)
            .unwrap()
            .with_metadata_options(Some(options));
        schema_decoder.push_range(0..file_len, data).unwrap();
        let DecodeResult::Data(metadata) = schema_decoder.try_decode().unwrap() else {
            panic!("could not parse metadata")
        };

        assert_eq!(expected, metadata);
        // the schema pointers should be the same
        let reference_ptr = expected.file_metadata().schema_descr_ptr();
        let decoded_ptr = metadata.file_metadata().schema_descr_ptr();
        assert!(Arc::ptr_eq(&reference_ptr, &decoded_ptr));
    }
}