parquet/file/metadata/options.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Options used to control metadata parsing
19
20use std::collections::HashSet;
21use std::sync::Arc;
22
23use crate::schema::types::SchemaDescPtr;
24
/// Policy selecting the columns for which a given kind of Parquet statistics is decoded.
///
/// # Example
/// ```rust
/// use parquet::file::metadata::ParquetStatisticsPolicy;
/// use parquet::file::serialized_reader::ReadOptionsBuilder;
/// use parquet::arrow::arrow_reader::ArrowReaderOptions;
///
/// // Set arrow options to skip encoding statistics for all columns.
/// let options =
///     ArrowReaderOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll);
///
/// // Set serialized reader options to decode encoding statistics for all columns.
/// let options =
///     ReadOptionsBuilder::new().with_encoding_stats_policy(ParquetStatisticsPolicy::KeepAll)
///     .build();
///
/// // Set arrow options to skip encoding statistics for all columns, but to decode statistics
/// // for columns 0 and 1.
/// let options = ArrowReaderOptions::new()
///     .with_encoding_stats_policy(ParquetStatisticsPolicy::skip_except(&[0, 1]));
/// ```
#[derive(Default, Debug, Clone)]
pub enum ParquetStatisticsPolicy {
    /// Every column has the relevant statistics decoded. This is the default.
    #[default]
    KeepAll,
    /// No column has the relevant statistics decoded.
    SkipAll,
    /// Only the columns whose indices are in the set have the relevant statistics
    /// decoded; all other columns are skipped.
    SkipExcept(Arc<HashSet<usize>>),
}

impl ParquetStatisticsPolicy {
    /// Build a policy that skips every column except those whose indices appear in `keep`.
    ///
    /// An empty `keep` slice yields [`Self::SkipAll`] rather than a
    /// [`Self::SkipExcept`] wrapping an empty set.
    pub fn skip_except(keep: &[usize]) -> Self {
        if keep.is_empty() {
            return Self::SkipAll;
        }
        let retained: HashSet<usize> = keep.iter().copied().collect();
        Self::SkipExcept(Arc::new(retained))
    }

    /// Returns `true` when this policy says the statistics of column `col_index`
    /// should be skipped.
    pub(crate) fn is_skip(&self, col_index: usize) -> bool {
        // Work out whether the column is decoded, then invert to answer "skip?".
        let decoded = match self {
            Self::KeepAll => true,
            Self::SkipAll => false,
            Self::SkipExcept(columns) => columns.contains(&col_index),
        };
        !decoded
    }
}
82
83/// Options that can be set to control what parts of the Parquet file footer
84/// metadata will be decoded and made present in the [`ParquetMetaData`] returned
85/// by [`ParquetMetaDataReader`] and [`ParquetMetaDataPushDecoder`].
86///
87/// [`ParquetMetaData`]: crate::file::metadata::ParquetMetaData
88/// [`ParquetMetaDataReader`]: crate::file::metadata::ParquetMetaDataReader
89/// [`ParquetMetaDataPushDecoder`]: crate::file::metadata::ParquetMetaDataPushDecoder
90#[derive(Debug, Clone)]
91pub struct ParquetMetaDataOptions {
92 schema_descr: Option<SchemaDescPtr>,
93 encoding_stats_as_mask: bool,
94 encoding_stats_policy: ParquetStatisticsPolicy,
95 column_stats_policy: ParquetStatisticsPolicy,
96 size_stats_policy: ParquetStatisticsPolicy,
97}
98
99impl Default for ParquetMetaDataOptions {
100 fn default() -> Self {
101 Self {
102 schema_descr: None,
103 encoding_stats_as_mask: true,
104 encoding_stats_policy: ParquetStatisticsPolicy::KeepAll,
105 column_stats_policy: ParquetStatisticsPolicy::KeepAll,
106 size_stats_policy: ParquetStatisticsPolicy::KeepAll,
107 }
108 }
109}
110
111impl ParquetMetaDataOptions {
112 /// Return a new default [`ParquetMetaDataOptions`].
113 pub fn new() -> Self {
114 Default::default()
115 }
116
117 /// Returns an optional [`SchemaDescPtr`] to use when decoding. If this is not `None` then
118 /// the schema in the footer will be skipped.
119 pub fn schema(&self) -> Option<&SchemaDescPtr> {
120 self.schema_descr.as_ref()
121 }
122
123 /// Provide a schema to use when decoding the metadata.
124 pub fn set_schema(&mut self, val: SchemaDescPtr) {
125 self.schema_descr = Some(val);
126 }
127
128 /// Call [`Self::set_schema`] and return `Self` for chaining.
129 pub fn with_schema(mut self, val: SchemaDescPtr) -> Self {
130 self.set_schema(val);
131 self
132 }
133
134 /// Returns whether to present the [`encoding_stats`] field of the Parquet `ColumnMetaData`
135 /// as a bitmask (defaults to `true`).
136 ///
137 /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this
138 /// might be desirable.
139 ///
140 /// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
141 /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
142 /// [`encoding_stats`]:
143 /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
144 pub fn encoding_stats_as_mask(&self) -> bool {
145 self.encoding_stats_as_mask
146 }
147
148 /// Convert [`encoding_stats`] from a vector of [`PageEncodingStats`] to a bitmask. This can
149 /// speed up metadata decoding while still enabling some use cases served by the full stats.
150 ///
151 /// Note that if for a given column both this option and `skip_encoding_stats` are `true`, the
152 /// stats will be skipped and not be returned as a mask.
153 ///
154 /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for more information.
155 ///
156 /// [`PageEncodingStats`]: crate::file::metadata::PageEncodingStats
157 /// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
158 /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
159 /// [`encoding_stats`]:
160 /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
161 pub fn set_encoding_stats_as_mask(&mut self, val: bool) {
162 self.encoding_stats_as_mask = val;
163 }
164
165 /// Call [`Self::set_encoding_stats_as_mask`] and return `Self` for chaining.
166 pub fn with_encoding_stats_as_mask(mut self, val: bool) -> Self {
167 self.set_encoding_stats_as_mask(val);
168 self
169 }
170
171 /// Returns whether to skip decoding the [`encoding_stats`] in the Parquet `ColumnMetaData`
172 /// for the column indexed by `col_index`.
173 ///
174 /// [`encoding_stats`]:
175 /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
176 pub fn skip_encoding_stats(&self, col_index: usize) -> bool {
177 self.encoding_stats_policy.is_skip(col_index)
178 }
179
180 /// Sets the decoding policy for [`encoding_stats`] in the Parquet `ColumnMetaData`.
181 ///
182 /// The default policy is to decode all `encoding_stats`.
183 ///
184 /// This option takes precedence over [`Self::encoding_stats_as_mask`].
185 ///
186 /// [`encoding_stats`]:
187 /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
188 pub fn set_encoding_stats_policy(&mut self, policy: ParquetStatisticsPolicy) {
189 self.encoding_stats_policy = policy;
190 }
191
192 /// Call [`Self::set_encoding_stats_policy`] and return `Self` for chaining.
193 pub fn with_encoding_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self {
194 self.set_encoding_stats_policy(policy);
195 self
196 }
197
198 /// Returns whether to skip decoding the [`statistics`] in the Parquet `ColumnMetaData`
199 /// for the column indexed by `col_index`.
200 ///
201 /// [`statistics`]:
202 /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L912
203 pub fn skip_column_stats(&self, col_index: usize) -> bool {
204 self.column_stats_policy.is_skip(col_index)
205 }
206
207 /// Sets the decoding policy for [`statistics`] in the Parquet `ColumnMetaData`.
208 ///
209 /// The default policy is to decode all `statistics`.
210 ///
211 /// [`statistics`]:
212 /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L912
213 pub fn set_column_stats_policy(&mut self, policy: ParquetStatisticsPolicy) {
214 self.column_stats_policy = policy;
215 }
216
217 /// Call [`Self::set_column_stats_policy`] and return `Self` for chaining.
218 pub fn with_column_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self {
219 self.set_column_stats_policy(policy);
220 self
221 }
222
223 /// Returns whether to skip decoding the [`size_statistics`] in the Parquet `ColumnMetaData`
224 /// for the column indexed by `col_index`.
225 ///
226 /// [`size_statistics`]:
227 /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L936
228 pub fn skip_size_stats(&self, col_index: usize) -> bool {
229 self.size_stats_policy.is_skip(col_index)
230 }
231
232 /// Sets the decoding policy for [`size_statistics`] in the Parquet `ColumnMetaData`.
233 ///
234 /// The default policy is to decode all `size_statistics`.
235 ///
236 /// [`size_statistics`]:
237 /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L936
238 pub fn set_size_stats_policy(&mut self, policy: ParquetStatisticsPolicy) {
239 self.size_stats_policy = policy;
240 }
241
242 /// Call [`Self::set_size_stats_policy`] and return `Self` for chaining.
243 pub fn with_size_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self {
244 self.set_size_stats_policy(policy);
245 self
246 }
247}
248
#[cfg(test)]
mod tests {
    use bytes::Bytes;

    use crate::{
        DecodeResult,
        file::metadata::{ParquetMetaDataOptions, ParquetMetaDataPushDecoder},
        util::test_common::file_util::get_test_file,
    };
    use std::{io::Read, sync::Arc};

    /// Default options present encoding stats as a mask.
    #[test]
    fn test_options_default() {
        assert!(ParquetMetaDataOptions::default().encoding_stats_as_mask());
    }

    /// Decoding with a caller-provided schema yields metadata equal to a plain decode
    /// and reuses the provided schema pointer.
    #[test]
    fn test_provide_schema() {
        let mut raw = Vec::<u8>::new();
        get_test_file("alltypes_plain.parquet")
            .read_to_end(&mut raw)
            .unwrap();

        let data = Bytes::from(raw);
        let file_len = data.len() as u64;

        // First pass: decode normally to obtain the reference metadata and its schema.
        let mut decoder = ParquetMetaDataPushDecoder::try_new(file_len).unwrap();
        decoder.push_range(0..file_len, data.clone()).unwrap();
        let DecodeResult::Data(expected) = decoder.try_decode().unwrap() else {
            panic!("could not parse metadata");
        };
        let expected_schema = expected.file_metadata().schema_descr_ptr();

        // Second pass: hand the schema from the first pass to the decoder via options.
        let mut options = ParquetMetaDataOptions::new();
        options.set_schema(expected_schema);
        let options = Arc::new(options);

        let mut decoder = ParquetMetaDataPushDecoder::try_new(file_len)
            .unwrap()
            .with_metadata_options(Some(options));
        decoder.push_range(0..file_len, data).unwrap();
        let DecodeResult::Data(metadata) = decoder.try_decode().unwrap() else {
            panic!("could not parse metadata");
        };

        assert_eq!(expected, metadata);
        // the schema pointers should be the same
        let expected_ptr = expected.file_metadata().schema_descr_ptr();
        let actual_ptr = metadata.file_metadata().schema_descr_ptr();
        assert!(Arc::ptr_eq(&expected_ptr, &actual_ptr));
    }
}
305}